安装scrapy
a. pip3 install wheel
b. 下载twisted http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
c. 进入下载目录,执行 pip3 install Twisted-xxxxx.whl
d. pip3 install scrapy -i http://pypi.douban.com/simple
--trusted-host pypi.douban.com
e. pip3 install pywin32 -i http://pypi.douban.com/simple
--trusted-host pypi.douban.com
创建爬虫项目
scrapy startproject safly
cd safly
scrapy genspider chouti chouti.com
关于编码问题
import sys, io
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')
代码:
# -*- coding: utf-8 -*-
import scrapy
class ChoutiSpider(scrapy.Spider):
    """Minimal first spider: fetch chouti.com and dump the raw page HTML."""
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://chouti.com/']

    def parse(self, response):
        # Callback invoked once a start URL has been downloaded;
        # simply print the decoded response body.
        print(response.text)
通过scrapy crawl chouti --nolog
测试
项目结构以及爬虫应用简介
project_name/
scrapy.cfg
project_name/
__init__.py
items.py
pipelines.py
settings.py
spiders/
__init__.py
爬虫1.py
爬虫2.py
爬虫3.py
文件说明:
scrapy.cfg 项目的主配置信息。(真正爬虫相关的配置信息在settings.py文件中)
items.py 设置数据存储模板,用于结构化数据,如:Django的Model
pipelines 数据处理行为,如:一般结构化的数据持久化
settings.py 配置文件,如:递归的层数、并发数,延迟下载等
spiders 爬虫目录,如:创建文件,编写爬虫规则
选择器
from scrapy.selector import Selector, HtmlXPathSelector
from scrapy.http import HtmlResponse

# Fixture document used by every XPath example below.
html = """<!DOCTYPE html>
<html>
<head lang="en">
<meta charset="UTF-8">
<title></title>
</head>
<body>
<ul>
<li class="item-"><a id='i1' href="link.html">i1 item</a></li>
<li class="item-0"><a id='i2' href="llink.html">i2 item</a></li>
<li class="item-1"><a href="llink2.html">second item<span>vv</span></a></li>
</ul>
<div><a href="llink2.html">second item</a></div>
</body>
</html>
"""
response = HtmlResponse(url='http://example.com', body=html, encoding='utf-8')

# Every <a> element anywhere in the document.
hxs = Selector(response=response).xpath('//a')
print(hxs)
# <a> elements that carry an "id" attribute (any value).
hxs = Selector(response=response).xpath('//a[@id]')
print(hxs)
# <a> elements whose id attribute equals "i1".
hxs = Selector(response=response).xpath('//a[@id="i1"]')
print(hxs)
# Two attribute predicates chained = logical AND.
hxs = Selector(response=response).xpath('//a[@href="link.html"][@id="i1"]')
print(hxs)
# <a> elements whose href contains the substring "link".
hxs = Selector(response=response).xpath('//a[contains(@href, "link")]')
print(hxs)
# <a> elements whose href starts with "link".
hxs = Selector(response=response).xpath('//a[starts-with(@href, "link")]')
print(hxs)
# Regex predicate: id matches i\d+ (raw string keeps the backslash literal).
hxs = Selector(response=response).xpath(r'//a[re:test(@id, "i\d+")]')
print(hxs)
# Same regex filter, extracting the text nodes.
hxs = Selector(response=response).xpath(r'//a[re:test(@id, "i\d+")]/text()').extract()
print(hxs)
# Same regex filter, extracting the href attribute values.
hxs = Selector(response=response).xpath(r'//a[re:test(@id, "i\d+")]/@href').extract()
print(hxs)
# Absolute path from the document root.
hxs = Selector(response=response).xpath('/html/body/ul/li/a/@href').extract()
print(hxs)
# extract_first() returns only the first match (or None if nothing matches).
hxs = Selector(response=response).xpath('//body/ul/li/a/@href').extract_first()
print(hxs)

# Relative XPath evaluated from each <li> node.
ul_list = Selector(response=response).xpath('//body/ul/li')
for item in ul_list:
    v1 = item.xpath('./a/span')   # explicit current-node prefix
    # equivalent:
    v2 = item.xpath('a/span')     # implicit relative path
    # equivalent:
    v3 = item.xpath('*/a/span')   # any child element, then a/span
    # print(v1)
    # print(v2)
    # print(v3)
输出如下:
扫描二维码关注公众号,回复:
1000091 查看本文章
E:\python\python_sdk\python.exe E:/python/safly/safly/spiders/safly.py
[<Selector xpath='//a' data='<a id="i1" href="link.html">i1 item</a>'>, <Selector xpath='//a' data='<a id="i2" href="llink.html">i2 item</a>'>, <Selector xpath='//a' data='<a href="llink2.html">second item<span>v'>, <Selector xpath='//a' data='<a href="llink2.html">second item</a>'>]
[<Selector xpath='//a[@id]' data='<a id="i1" href="link.html">i1 item</a>'>, <Selector xpath='//a[@id]' data='<a id="i2" href="llink.html">i2 item</a>'>]
[<Selector xpath='//a[@id="i1"]' data='<a id="i1" href="link.html">i1 item</a>'>]
[<Selector xpath='//a[@href="link.html"][@id="i1"]' data='<a id="i1" href="link.html">i1 item</a>'>]
[<Selector xpath='//a[contains(@href, "link")]' data='<a id="i1" href="link.html">i1 item</a>'>, <Selector xpath='//a[contains(@href, "link")]' data='<a id="i2" href="llink.html">i2 item</a>'>, <Selector xpath='//a[contains(@href, "link")]' data='<a href="llink2.html">second item<span>v'>, <Selector xpath='//a[contains(@href, "link")]' data='<a href="llink2.html">second item</a>'>]
[<Selector xpath='//a[starts-with(@href, "link")]' data='<a id="i1" href="link.html">i1 item</a>'>]
[<Selector xpath='//a[re:test(@id, "i\\d+")]' data='<a id="i1" href="link.html">i1 item</a>'>, <Selector xpath='//a[re:test(@id, "i\\d+")]' data='<a id="i2" href="llink.html">i2 item</a>'>]
['i1 item', 'i2 item']
['link.html', 'llink.html']
['link.html', 'llink.html', 'llink2.html']
link.html
Process finished with exit code 0
爬虫案例
chouti.py
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
from scrapy.selector import HtmlXPathSelector
from scrapy.http import Request
from ..items import XianglongItem
class ChoutiSpider(scrapy.Spider):
    """Spider for dig.chouti.com: yields one item per news entry and
    follows pagination links recursively."""
    name = 'chouti'
    allowed_domains = ['chouti.com']
    start_urls = ['http://dig.chouti.com/',]

    def parse(self, response):
        """
        Called automatically once a start URL has been downloaded;
        ``response`` wraps everything about the HTTP response.
        :param response: downloaded page
        :return: generator of items and follow-up Requests
        """
        hxs = HtmlXPathSelector(response=response)
        # One <div class="item"> per news entry on the page.
        # NOTE: loop variable renamed from the original's `item`, which
        # was confusingly reassigned to the XianglongItem inside the loop.
        entries = hxs.xpath("//div[@id='content-list']/div[@class='item']")
        for entry in entries:
            href = entry.xpath('.//div[@class="part1"]//a[1]/@href').extract_first()
            text = entry.xpath('.//div[@class="part1"]//a[1]/text()').extract_first()
            news_item = XianglongItem(title=text, href=href)
            print(href, "-----")
            yield news_item
        # Pagination hrefs are site-relative; prefix the scheme+host by hand.
        pages = hxs.xpath('//div[@id="page-area"]//a[@class="ct_pagepa"]/@href').extract()
        for page_url in pages:
            page_url = "https://dig.chouti.com" + page_url
            yield Request(url=page_url, callback=self.parse)
items.py
import scrapy
class XianglongItem(scrapy.Item):
    """Container for one scraped news entry."""
    # Link text of the news entry.
    title = scrapy.Field()
    # Target URL of the news entry.
    href = scrapy.Field()
pipelines.py
class XianglongPipeline(object):
    """Item pipeline that appends each item's URL to ``url.log``."""

    def process_item(self, item, spider):
        """Write the item's ``href`` field to the log file and pass the item
        on to the next pipeline stage.
        :param item: scraped item (must contain an 'href' key)
        :param spider: spider that produced the item (unused)
        :return: the item, unchanged
        """
        self.f.write(item['href'] + '\n')
        self.f.flush()  # persist immediately so a crash loses at most one URL
        return item

    def open_spider(self, spider):
        """Called when the spider starts: open the output file.
        :param spider: the spider being opened (unused)
        """
        # Explicit encoding avoids the platform-dependent default
        # (e.g. GBK on Chinese Windows), which can crash on non-ASCII URLs.
        self.f = open('url.log', 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Called when the spider closes: release the file handle.
        :param spider: the spider being closed (unused)
        """
        self.f.close()
settings.py
# Enable the project pipeline; 300 is its order value (lower runs first).
ITEM_PIPELINES = {
'xianglong.pipelines.XianglongPipeline': 300,
}
# Stop following links more than 2 levels deep from the start URLs.
DEPTH_LIMIT = 2
url.log
https://www.zhihu.com/question/266252635/answer/344487421
http://dig.chouti.com/pic/show?nid=e7d46c243dc4958d6d1afb030420f381&lid=19446191
https://www.thepaper.cn/newsDetail_forward_2123987
https://www.thepaper.cn/newsDetail_forward_2123704
https://mp.weixin.qq.com/s/vcK5Ew1xCQuc4LlqjNCUFQ
https://dig.chouti.com/user/cdu_44476851294/submitted/1
https://www.miaopai.com/show/nTckVvYDtr1Jgwvg6tKBU8uZ5~fnmVSSbzkzmQ__.htm
https://www.thepaper.cn/newsDetail_forward_2124022
http://dig.chouti.com/pic/show?nid=c4f9cf6ad4bc861f11437ff6b9e2af6b&lid=19444656
http://tech.163.com/18/0510/22/DHFRA37S00097U7R.html
http://www.pearvideo.com/video_1341634
http://dig.chouti.com/pic/show?nid=fe50ff217fba43838eb85b894588a762&lid=19442361
https://weibo.com/tv/v/Gg8DF40SK?fid=1034:1adeec4f87115424b0f493f5f0a6f82c
http://tech.qq.com/a/20180511/021642.htm
https://www.jiemian.com/article/2115446.html
http://world.huanqiu.com/exclusive/2018-05/12013262.html
http://sports.sina.com.cn/g/seriea/2018-05-11/doc-ihamfahw0985498.shtml
http://www.vice.cn/read/the-vice-first-hand-report-hong-kong-is-running-out-of-space-to-bury-its-dead
http://tech.qq.com/a/20180511/021384.htm
https://mp.weixin.qq.com/s/Yko8396TRwzh1EuKi5Ymtw
http://www.pearvideo.com/video_1342215
http://dig.chouti.com/pic/show?nid=83691408087bb54c6b57deb5b6025028&lid=19442846
http://dig.chouti.com/pic/show?nid=78d646578d07ed03e8263a51def41572&lid=19436631
http://user.guancha.cn/main/content?id=16005
http://www.maoqiuapp.com/topic/11aee45e89314f1b93b8826d930d4a1e
https://www.huxiu.com/article/243369.html
http://www.guancha.cn/economy/2018_05_09_456290.shtml
https://www.jiemian.com/article/2120611.html
http://dig.chouti.com/pic/show?nid=b5bb1356ad844edff8b525ea68b17e0b&lid=19401141
http://www.guancha.cn/america/2018_05_09_456264.shtml