1. Running Scrapy in PyCharm
- In a Windows cmd window, create the Scrapy project with scrapy startproject <project name>.
- In PyCharm's Run… configuration, set Script path to the path of Scrapy's cmdline.py, e.g. F:\programs\python\Lib\site-packages\scrapy\cmdline.py
- Set Parameters to crawl <spider name>.
- Set Working directory to the folder containing the Scrapy project.
- Executing this run configuration then starts the crawl (an equivalent launcher script is sketched below).
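As an alternative to pointing the run configuration at an installed Scrapy file, a small launcher script in the project root can serve as the Script path. A minimal sketch, assuming the spider name mycsdn defined in section 5:

# run.py -- place it in the Scrapy project root; running it is equivalent
# to executing `scrapy crawl mycsdn` in that directory
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'mycsdn'])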
2. Crawl target
The previous post used a synchronous requests-based crawler to collect all the sub-links under a page; this post uses the asynchronous Scrapy framework to crawl the main content of each of those links:
https://blog.csdn.net/wxfghy/article/details/80308825
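The spider in section 5 expects F://demo/urls.txt, produced by that requests crawler, to hold one article URL per line (each line newline-terminated), roughly like this, where the second line is a placeholder:

https://blog.csdn.net/wxfghy/article/details/80308825
https://blog.csdn.net/wxfghy/article/details/xxxxxxxx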
Using Scrapy means editing four files: the generated settings.py, items.py, and pipelines.py, plus the custom spider code mycsdn.py.
- What settings.py needs varies from project to project; most of the work happens in the other three files. Typical entries for this project are sketched below.
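A hedged sketch of the settings.py entries this kind of project is likely to need; the package name csdn02 is an assumption inferred from the item class Csdn02Item:

# settings.py (excerpt); 'csdn02' is an assumed project package name
ROBOTSTXT_OBEY = False  # CSDN's robots.txt would otherwise block the crawl

# register the pipeline from pipelines.py so its process_item() runs
ITEM_PIPELINES = {
    'csdn02.pipelines.Csdn02Pipeline': 300,
}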
3. items.py
import scrapy


class Csdn02Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()       # article title
    updatetime = scrapy.Field()  # publication time
    readcount = scrapy.Field()   # read count
    author = scrapy.Field()      # author
    ranking = scrapy.Field()     # author's blog ranking
    curl = scrapy.Field()        # article URL
    context = scrapy.Field()     # article body (HTML)
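A Csdn02Item behaves like a dict: only the declared fields may be set, and values are read back by key, which is how the spider in section 5 fills it. A minimal usage sketch:

po = Csdn02Item()
po['title'] = 'demo'   # assigning an undeclared key would raise KeyError
print(po['title'])     # read back like a dict
print(dict(po))        # items convert cleanly to a plain dict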
4. pipelines.py
class Csdn02Pipeline(object):
    def __init__(self):
        # title.txt collects every field except the article body, one line per article
        self.cfile = open('F://demo/title.txt', 'a', encoding='utf8')

    def process_item(self, item, spider):
        curl = item['curl']
        title = item['title']
        updatetime = item['updatetime']
        readcount = item['readcount']
        author = item['author']
        ranking = item['ranking']
        context = item['context']
        self.cfile.write(f'Title: {title}\tPublished: {updatetime}\tReads: {readcount}\t'
                         f'Author: {author}\tRanking: {ranking}\tURL: {curl}\n')
        # context is the list returned by re.findall, so writelines() dumps it into an .html file
        with open(f'F://demo/{title}.html', 'a', encoding='utf-8') as wl:
            wl.writelines(context)
        return item

    def close_spider(self, spider):
        # Scrapy calls this when the spider finishes; close the summary file
        self.cfile.close()
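Scrapy invokes process_item() once for every item the spider yields and close_spider() once when the crawl ends; both only happen if the pipeline is registered under ITEM_PIPELINES in settings.py, as sketched in section 2.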
5. The custom spider code: mycsdn.py
import re

import scrapy

from csdn02.items import Csdn02Item  # assumes the project package is named csdn02


class MycsdnSpider(scrapy.Spider):
    name = 'mycsdn'
    allowed_domains = ['blog.csdn.net']
    # read every link collected by the requests crawler from the previous post
    with open('F://demo/urls.txt', 'r', encoding='utf8') as file:
        # readlines() returns the file as a list of lines
        urllist = file.readlines()
    start_urls = []
    for u in urllist:
        # each line was written with a trailing '\n', which would corrupt the
        # request URL, so strip it with re.sub before appending
        u = re.sub(r'\n', '', u)
        start_urls.append(u)
    def parse(self, response):
        # decode the raw bytes of the response into a str (response.text would also work)
        mbody = response.body
        mbody = str(mbody, encoding='utf8')
        # each field is cut out with a lookbehind/lookahead pair, then the
        # list returned by findall() is joined into a plain string
        curl = re.findall(r'(?<=<link rel="canonical" href=").+(?="/>)', mbody)
        curl = ''.join(curl)
        title = re.findall(r'(?<=<h6 class="title-article">).+(?=</h6>)', mbody)
        title = ''.join(title)
        # Windows forbids certain characters in file names, so strip them from the title
        title = re.sub(r'\||\s|\\|:', '', title)
        updatetime = re.findall(r'(?<=<span class="time">).+(?=</span>)', mbody)
        updatetime = ''.join(updatetime)
        readcount = re.findall(r'(?<=<span class="read-count">).+(?=</span>)', mbody)
        readcount = ''.join(readcount)
        author = re.findall(r'(?<=id="uid">).+(?=</a>)', mbody)
        author = ''.join(author)
        ranking = re.findall(r'(?<=<dd>).{1,10}(?=</dd>)', mbody)
        ranking = ''.join(ranking)
        context = re.findall(r'(?<=<article>)(?:.|[\r\n])*(?=</article>)', mbody)
        # instantiate the Csdn02Item imported from items.py and fill its fields
        po = Csdn02Item()
        po['curl'] = curl
        po['title'] = title
        po['updatetime'] = updatetime
        po['readcount'] = readcount
        po['author'] = author
        po['ranking'] = ranking
        po['context'] = context
        # yield hands the Csdn02Item object to the pipeline
        yield po
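All of the extraction above relies on lookbehind/lookahead pairs: (?<=A) and (?=B) are zero-width, so the pattern between them captures the text between A and B without including either delimiter. A quick illustrative check on made-up markup:

import re

html = '<h6 class="title-article">Hello Scrapy</h6>'  # illustrative, not real CSDN markup
print(re.findall(r'(?<=<h6 class="title-article">).+(?=</h6>)', html))
# -> ['Hello Scrapy']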