Scrapy爬取《坏蛋是怎样炼成的4》

scrapy具体介绍就不用说了,自己百度一下。或者参考以下文档

https://blog.csdn.net/u011054333/article/details/70165401

直接在cmd里运行 

scrapy startproject huaidan

scrapy genspider huaidan huaidan4.com

然后把下面的代码保存到项目的spiders文件夹里

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 from scrapy.http import Request
 4 from urllib import parse
 5 import re
 6 
 7 class huaidan(scrapy.Spider):
 8     name = "huaidan"
 9     allowed_domains = ["www.huaidan4.com"]
10     start_urls = ["http://www.huaidan4.com/di-yi-fen-lei.html",
11                   "http://www.huaidan4.com/di-er-juan.html",
12                   "http://www.huaidan4.com"]
13 
14 
15     #提取下一页文章url交给scrpy进行下载
16     def parse(self, response):
17         #获取文章url
18         all_article=response.css('.container ul li a::attr(href)').extract()
19         all_url=[]
20         for article_url in all_article:
21             if article_url in all_url:
22                 pass
23             else:
24                 all_url.append(article_url)
25                 yield Request(url=article_url,encoding='utf-8',callback=self.parse_detail)
26 
27 
28 
29 
30     #提取文章的具体字段
31     def parse_detail(self,response):
32         #获取文章标题
33         article_title = response.xpath('//*[@id="content"]/div[1]/div[1]/h2/text()').extract_first()
34 
35         #获取创建时间
36         create_time = response.xpath('//*[@id="content"]/div[1]/div[1]/span/text()[2]').extract_first().strip()
37 
38         #获取文章正文
39         article_text = response.css('.post_entry,p::text').extract_first()
40         #处理正文标点符号和无用的信息
41         article_text = re.sub('</?\w+[^>]*>','',article_text)
42         article_text = article_text.replace("\', \'","")
43         article_text = article_text.replace("\\u3000","").strip()
44         article_text = article_text.replace("\\xa0\\xa0\\xa0\\xa0","")
45         article_text = article_text.replace("(新书上传,求收藏,推荐!!!!!!!!!!!!!!!!!!!!)","")
46         article_text = article_text.replace("\\r\\n", "\n")
47         article_text = article_text.replace("免费小说", "")
48         article_text = article_text.replace("www.huaidan4.com", "")
49         article_text = article_text.replace("neirong_2();", "")
50         article_text = article_text.replace("dibutuijian();", "")
51         article_text = article_text.replace("◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。", "")
52         article_text = article_text.replace("《坏蛋是怎样炼成的4》是继曹三少坏蛋是怎样炼成的3的又一作品,作者是曹三少,如果你喜欢坏蛋是怎样炼成的4,请收藏本站以便下次阅读。","")
53         article_text = re.sub('/?\s+', '', article_text)
54 
55         #保存文件
56         self.save_article(article_title,create_time,str(article_text))
57 
58     #保存文件的方法
59     def save_article(self,article_title,create_time,article_text):
60         biaoti = re.sub('\W+','-',article_title)
61         with open(biaoti+'.txt','w',encoding='utf-8') as file:
62             neirong = (article_title+'\n'+create_time+'\n'+article_text)
63             file.write(neirong)
64             file.close()

猜你喜欢

转载自www.cnblogs.com/guoyabin/p/9109933.html
今日推荐