Using Scrapy to crawl a government website's accepted-complaint listings

1. Create the project:

Command line: scrapy startproject dongguan

cd dongguan

scrapy genspider dg wz.sun0769.com  # dg is the spider name; wz.sun0769.com is the domain to crawl (leave out the http:// prefix)

The target is the Dongguan "Sunshine Wenzheng" (阳光问政) site: url=http://wz.sun0769.com/index.php/question/ruse?page=30
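After these two commands the generated project layout looks roughly like this (exact files may vary slightly with the Scrapy version):

dongguan/
    scrapy.cfg
    dongguan/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            dg.py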

2. Define the item fields in items.py

import scrapy
class DongguanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    bnum = scrapy.Field()      # case number
    btype = scrapy.Field()     # complaint type
    btitle = scrapy.Field()    # title
    burl = scrapy.Field()      # URL of the detail page
    bcontent = scrapy.Field()  # body text of the complaint
    bstate = scrapy.Field()    # processing status
    bfriend = scrapy.Field()   # poster (netizen) name
    btime = scrapy.Field()     # post time
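Scrapy items behave like dictionaries, so the fields defined above are set and read with normal dict syntax. A quick sketch (the value is made up, just to illustrate access):

from dongguan.items import DongguanItem

item = DongguanItem()
item['btitle'] = 'example title'   # made-up value for illustration
print(dict(item))                  # -> {'btitle': 'example title'}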

3. Do the crawling work in the spider dg.py

# -*- coding: utf-8 -*-
import scrapy
import requests
from lxml import etree
from dongguan import items
from scrapy.spiders import Rule, CrawlSpider      # crawl rules
from scrapy.linkextractors import LinkExtractor   # link extraction

class DgSpider(CrawlSpider):  # inherit from CrawlSpider, not scrapy.Spider
    name = 'dg'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/ruse?page=30']
    # rules: the crawl rules; the attribute must be named "rules"
    # LinkExtractor(allow=...) matches links against the given regular expression
    # follow=True means matched pages are scanned for further links; False means they are not
    # callback names the method called with each matched response
    rules = [Rule(LinkExtractor(allow=(r"page=(\d+)",)), follow=True, callback="get_parse")]

    def get_parse(self, response):  # do not name this callback "parse": CrawlSpider uses that name internally
        total=response.xpath("//table[@bgcolor=\"#FBFEFF\"]//tr")
        for b in total:
            item=items.DongguanItem()#实例化对象
            bnum = b.xpath("./td[1]/text()")[0].extract()
            btype = b.xpath("./td[2]/a[1]/text()")[0].extract()[1:-1]
            btitle = b.xpath("./td[2]/a[2]/text()")[0].extract()
            burl = b.xpath("./td[2]/a[2]/@href")[0].extract()
            bcontent=self.get_content(burl)
            bstate = b.xpath("./td[3]/span/text()")[0].extract()
            bfriend = b.xpath("./td[4]/text()")[0].extract()
            btime = b.xpath("./td[5]/text()")[0].extract()

            item['bnum']=bnum
            item['btype']=btype
            item['btitle']=btitle
            item["burl"]=burl
            item['bcontent']=bcontent
            item['bstate']=bstate
            item['bfriend']=bfriend
            item['btime']=btime
            # print(item['bnum'],item['btype'],item['btitle'],item["burl"],item['bcontent'],item['bstate'],item['bfriend'],item['btime'], "+++++++++++++++")
            yield item

    # fetch the body text of a single complaint's detail page
    def get_content(self, url):
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}

        cont = requests.get(url, headers=header)
        content = cont.content.decode("gb2312", errors='ignore')
        mytree = etree.HTML(content)
        content = mytree.xpath("//div[@class=\"c1 text14_2\"]/text()")[0].replace('\xa0', '').strip()
        return content
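As a side note, the spider above fetches each detail page synchronously with requests. An alternative (not the approach used in this post) is to let Scrapy schedule the detail pages itself and carry the partially filled item along in request meta. A minimal sketch, reusing the same XPaths but filling only a couple of fields for brevity; the spider name here is hypothetical so it does not clash with dg:

import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
from dongguan import items

class DgRequestSpider(CrawlSpider):
    name = 'dg_requests'   # hypothetical name, only for this sketch
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/ruse?page=30']
    rules = [Rule(LinkExtractor(allow=(r"page=(\d+)",)), follow=True, callback="get_parse")]

    def get_parse(self, response):
        for b in response.xpath('//table[@bgcolor="#FBFEFF"]//tr'):
            item = items.DongguanItem()
            item['btitle'] = b.xpath('./td[2]/a[2]/text()')[0].extract()
            item['burl'] = b.xpath('./td[2]/a[2]/@href')[0].extract()
            # pass the partially filled item along with the detail-page request
            yield scrapy.Request(response.urljoin(item['burl']),
                                 callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']
        content = response.xpath('//div[@class="c1 text14_2"]/text()').extract_first(default='')
        item['bcontent'] = content.replace('\xa0', '').strip()
        yield item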

4. Write the storage logic in pipelines.py; here the items are saved to a MySQL database

import pymysql


class DongguanPipeline(object):
    def open_spider(self, spider):
        self.conn = pymysql.connect(host="localhost", user="root", password="123456",
                                    database="script", port=3306, charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # use a parameterized query instead of % string formatting to avoid quoting and SQL-injection problems
        sql = ("insert into dg(bnum, btype, btitle, bcontent, bstate, bfriend, btime) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s)")
        self.cursor.execute(sql, (item['bnum'], item['btype'], item['btitle'], item['bcontent'],
                                  item['bstate'], item['bfriend'], item['btime']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
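The INSERT above assumes a table named dg already exists in the script database. A minimal creation sketch with pymysql (the column names follow the INSERT; the types and lengths are assumptions, not taken from the original post):

import pymysql

conn = pymysql.connect(host="localhost", user="root", password="123456",
                       database="script", port=3306, charset='utf8')
with conn.cursor() as cursor:
    # column types/lengths are guesses; adjust to the real data
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS dg (
            bnum     VARCHAR(64),
            btype    VARCHAR(64),
            btitle   VARCHAR(255),
            bcontent TEXT,
            bstate   VARCHAR(64),
            bfriend  VARCHAR(64),
            btime    VARCHAR(64)
        ) DEFAULT CHARSET = utf8
    """)
conn.commit()
conn.close()
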
5. In settings.py, uncomment:
ITEM_PIPELINES = {
   'dongguan.pipelines.DongguanPipeline': 300,
}
At the same time, change ROBOTSTXT_OBEY = True (the default) to ROBOTSTXT_OBEY = False so that Scrapy does not obey the site's robots.txt rules.
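Optionally (these two lines are not part of the original write-up), a small download delay and a default User-Agent in settings.py make the crawl gentler on the site; both are standard Scrapy settings:

DOWNLOAD_DELAY = 1   # wait about one second between requests
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'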
6. Create a start.py in the project directory so the spider is easy to launch:
import scrapy.cmdline
def main():
    scrapy.cmdline.execute(["scrapy","crawl","dg"])

if __name__ == '__main__':
    main()
If you'd rather not create a launcher script, you can also start the spider from the command line with: scrapy crawl dg
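Scrapy's built-in feed export can also dump the scraped items straight to a file without any pipeline, for example: scrapy crawl dg -o dg.json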

