1. Create the project:
Command line: scrapy startproject dongguan
cd dongguan
scrapy genspider dg wz.sun0769.com  # dg is the spider name; wz.sun0769.com is the domain to crawl, given without the http:// prefix
Target: the Dongguan Sunshine Inquiry (东莞阳光问政) site, url=http://wz.sun0769.com/index.php/question/ruse?page=30
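For reference, the two commands above produce Scrapy's standard project skeleton, roughly as follows (a sketch of the default template; minor files vary by Scrapy version):

dongguan/
    scrapy.cfg
    dongguan/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            dg.py        # generated by scrapy genspider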
2. Define the item fields in items.py
import scrapy

class DongguanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    bnum = scrapy.Field()
    btype = scrapy.Field()
    btitle = scrapy.Field()
    burl = scrapy.Field()
    bcontent = scrapy.Field()
    bstate = scrapy.Field()
    bfriend = scrapy.Field()
    btime = scrapy.Field()
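scrapy.Item instances behave like dicts, which is how the spider below fills them in. A minimal sketch (the title string is made up):

from dongguan.items import DongguanItem

item = DongguanItem()
item['btitle'] = 'example title'   # keys must be declared Fields above, otherwise KeyError
print(item['btitle'])              # -> example title
print(dict(item))                  # -> {'btitle': 'example title'}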
3. Do the actual crawling in the spider dg.py
# -*- coding: utf-8 -*-
import scrapy
import requests
from lxml import etree
from dongguan import items
from scrapy.spiders import Rule, CrawlSpider      # crawl rules
from scrapy.linkextractors import LinkExtractor   # link extraction

class DgSpider(CrawlSpider):  # inherits from CrawlSpider
    name = 'dg'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/ruse?page=30']
    # rules holds the crawl rules; the attribute must keep the name "rules"
    # LinkExtractor(): allow is a regex that candidate links must match
    # follow: True keeps following links from matched pages, False stops there
    # callback: the method invoked for each matched response
    rules = [Rule(LinkExtractor(allow=(r"page=(\d+)")), follow=True, callback="get_parse")]

    def get_parse(self, response):
        # CrawlSpider implements parse() itself, so the callback must use a different name
        total = response.xpath("//table[@bgcolor=\"#FBFEFF\"]//tr")
        for b in total:
            item = items.DongguanItem()  # instantiate an item
            bnum = b.xpath("./td[1]/text()")[0].extract()
            btype = b.xpath("./td[2]/a[1]/text()")[0].extract()[1:-1]
            btitle = b.xpath("./td[2]/a[2]/text()")[0].extract()
            burl = b.xpath("./td[2]/a[2]/@href")[0].extract()
            bcontent = self.get_content(burl)
            bstate = b.xpath("./td[3]/span/text()")[0].extract()
            bfriend = b.xpath("./td[4]/text()")[0].extract()
            btime = b.xpath("./td[5]/text()")[0].extract()
            item['bnum'] = bnum
            item['btype'] = btype
            item['btitle'] = btitle
            item['burl'] = burl
            item['bcontent'] = bcontent
            item['bstate'] = bstate
            item['bfriend'] = bfriend
            item['btime'] = btime
            yield item

    # fetch the body text of a detail page
    def get_content(self, url):
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
        cont = requests.get(url, headers=header)
        content = cont.content.decode("gb2312", errors='ignore')
        mytree = etree.HTML(content)
        content = mytree.xpath("//div[@class=\"c1 text14_2\"]/text()")[0].replace('\xa0', '').strip()
        return content
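The Rule relies entirely on that one regex to find pagination links, so it is worth sanity-checking it against a real URL. A throwaway sketch, separate from the spider:

import re

# the same pattern the LinkExtractor uses
pattern = re.compile(r"page=(\d+)")
m = pattern.search("http://wz.sun0769.com/index.php/question/ruse?page=60")
print(m.group(1))  # -> 60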
4. Write the storage step in pipelines.py; here the items go into a MySQL database
import pymysql

class DongguanPipeline(object):
    def open_spider(self, spider):
        self.conn = pymysql.connect(host="localhost", user="root", password="123456",
                                    database="script", port=3306, charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # use a parameterized query rather than % string formatting,
        # which breaks on quotes in the text and invites SQL injection
        sql = "insert into dg(bnum, btype, btitle, bcontent, bstate, bfriend, btime) values (%s, %s, %s, %s, %s, %s, %s)"
        self.cursor.execute(sql, (item['bnum'], item['btype'], item['btitle'], item['bcontent'],
                                  item['bstate'], item['bfriend'], item['btime']))
        self.conn.commit()
        return item  # pipelines should pass the item on

    def close_spider(self, spider):
        self.conn.close()
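The pipeline assumes a table named dg already exists in the script database. A minimal one-off sketch that creates it (the column types are my assumption, not from the original steps):

import pymysql

conn = pymysql.connect(host="localhost", user="root", password="123456",
                       database="script", port=3306, charset='utf8')
with conn.cursor() as cursor:
    # column types assumed; adjust to your data
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS dg (
            bnum     VARCHAR(32),
            btype    VARCHAR(64),
            btitle   VARCHAR(255),
            bcontent TEXT,
            bstate   VARCHAR(32),
            bfriend  VARCHAR(32),
            btime    VARCHAR(32)
        ) DEFAULT CHARSET = utf8
    """)
conn.commit()
conn.close()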
5. In settings.py, uncomment:
ITEM_PIPELINES = {
    'dongguan.pipelines.DongguanPipeline': 300,
}
At the same time, change ROBOTSTXT_OBEY (True by default) to ROBOTSTXT_OBEY = False, so the crawler does not obey the site's robots.txt restrictions.
6. Create a start.py in the project directory to make launching convenient
import scrapy.cmdline

def main():
    scrapy.cmdline.execute(["scrapy", "crawl", "dg"])

if __name__ == '__main__':
    main()
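Run it with python start.py; scrapy.cmdline.execute behaves as if the command had been typed in a shell, so the spider starts with the project's settings loaded.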
If you would rather not create the launcher script, entering scrapy crawl dg on the command line (from the project directory) starts the crawl just as well.
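To eyeball the scraped items without going through the MySQL pipeline, Scrapy's built-in feed export can dump them to a file instead (standard Scrapy CLI, not part of the original steps):
Command line: scrapy crawl dg -o dg.json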