Scrapy configuration, plus saving scraped data to a database, a CSV file, or a JSON file:

A basic Scrapy spider:

scrapy startproject project_name
scrapy genspider spider_name domain
# e.g. scrapy genspider baidu baidu.com
# this generates a baidu.py spider file
scrapy crawl spider_name
# run the spider
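
The generated baidu.py contains a minimal spider based on the default template; roughly:

import scrapy


class BaiduSpider(scrapy.Spider):
    name = 'baidu'                      # the name passed to `scrapy crawl`
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    def parse(self, response):
        # parse the downloaded response and yield items / follow-up requests
        pass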

Alternative: the CrawlSpider template:

scrapy startproject project_name
scrapy genspider -t crawl spider_name domain
# e.g. scrapy genspider -t crawl baidu baidu.com
# this generates a baidu.py file based on the CrawlSpider template
scrapy crawl spider_name
# run the spider
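
With the crawl template, the generated baidu.py is a CrawlSpider whose Rule/LinkExtractor pair decides which links to follow automatically (the allow pattern below is the template's placeholder):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class BaiduSpider(CrawlSpider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://baidu.com/']

    rules = (
        # follow every link matching the pattern and parse it with parse_item
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item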

settings.py configuration when crawling through a real browser (Selenium):

DOWNLOADER_MIDDLEWARES = {
    'Zhilian.middlewares.ZhilianDownloaderMiddleware': 543,
    # downloader middleware: once enabled, everything the downloader fetches
    # passes through (and can be intercepted by) this middleware
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}
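
The item pipelines shown later in this post likewise have to be enabled in settings.py; a minimal sketch, assuming the project module matches your pipelines.py (names here are illustrative):

ITEM_PIPELINES = {
    # the number is a priority, same idea as the middleware value above:
    # lower numbers run first
    'Intersting.pipelines.InterstingPipeline': 300,
}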

Configuration in middlewares.py:

# imports needed at the top of middlewares.py
from time import sleep
from selenium import webdriver
from scrapy.http import HtmlResponse


class ZhilianDownloaderMiddleware(object):
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        print("download in progress...")
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called

        # called whenever the downloader is about to fetch request.url;
        # intercept the download and hand it to selenium + webdriver instead
        opt = webdriver.ChromeOptions()
        opt.add_argument("--headless")
        driver = webdriver.Chrome(options=opt)
        # issue the GET request through the browser
        driver.get(request.url)
        sleep(1)  # crude wait for JavaScript-rendered content
        body = driver.page_source
        url = driver.current_url
        driver.quit()  # release the browser
        # build a response object from the browser-rendered page source;
        # returning it stops Scrapy's own downloader from fetching the URL
        return HtmlResponse(url=url, body=body, encoding='utf-8', request=request)
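
Note that the code above launches a fresh headless Chrome for every request, which is slow. A common refinement (a sketch, not from the original post; the class name is hypothetical) is to create one driver per crawl and close it on the spider_closed signal:

from scrapy import signals
from selenium import webdriver


class SeleniumDownloaderMiddleware(object):
    def __init__(self):
        opt = webdriver.ChromeOptions()
        opt.add_argument("--headless")
        self.driver = webdriver.Chrome(options=opt)  # one browser per crawl

    @classmethod
    def from_crawler(cls, crawler):
        mw = cls()
        # quit the browser when the spider finishes
        crawler.signals.connect(mw.spider_closed, signal=signals.spider_closed)
        return mw

    def spider_closed(self, spider):
        self.driver.quit()

process_request would then use self.driver instead of building a new driver each time.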

Supplement: saving items to a database, a CSV file, or a JSON file (in pipelines.py):

import csv


class InterstingPipeline(object):
    def open_spider(self, spider):
        # newline='' stops the csv module from inserting blank rows on Windows
        self.csv_file = open("u148.csv", 'w', encoding='utf-8', newline='')
        self.csvItems = []

    def process_item(self, item, spider):
        # the pipeline runs once per item yielded by the spider
        csv_item = []
        csv_item.append(item["author"])
        csv_item.append(item["title"])
        csv_item.append(item["img"])
        csv_item.append(item["abstract"])
        csv_item.append(item["time"])
        self.csvItems.append(csv_item)
        return item

    def close_spider(self, spider):
        writer = csv.writer(self.csv_file)
        writer.writerow(["author", "title", "img", "abstract", "time"])
        # writerows (plural): one CSV row per item; writerow here would
        # dump the whole list into a single row
        writer.writerows(self.csvItems)
        self.csv_file.close()
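
The pipeline above assumes an Item with five fields; a minimal items.py sketch (field names taken from the pipeline code, class name hypothetical):

import scrapy


class U148Item(scrapy.Item):
    author = scrapy.Field()
    title = scrapy.Field()
    img = scrapy.Field()
    abstract = scrapy.Field()
    time = scrapy.Field()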
import json


class ZhilianJsonPipeline(object):  # the JSON variant needs its own pipeline class
    def open_spider(self, spider):
        self.zhilian_json = open('zhilian.json', 'w', encoding='utf-8')
        self.items = []

    def process_item(self, item, spider):
        # collect items as plain dicts so they are JSON-serializable
        self.items.append(dict(item))
        return item

    def close_spider(self, spider):
        # ensure_ascii=False keeps non-ASCII text readable in the file
        self.zhilian_json.write(json.dumps(self.items, ensure_ascii=False))
        self.zhilian_json.close()
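
For a plain dump like this, Scrapy's built-in feed export can replace the custom pipeline entirely; the -o flag picks the format from the file extension (spider name assumed to be zhilian):

scrapy crawl zhilian -o zhilian.json
scrapy crawl zhilian -o zhilian.csv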
# MySQL variant, as its own pipeline class (requires pymysql):
import pymysql


class ZhilianMysqlPipeline(object):
    def open_spider(self, spider):
        # open the connection to the database (port must be an int)
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, db='zhilian',
                                    user='cy', password='123456', charset='utf8')
        self.cursor = self.conn.cursor()  # create a cursor

    def process_item(self, item, spider):
        # parameterized INSERT; the leading NULL lets an auto-increment
        # primary key fill itself in
        sql = 'INSERT INTO zl VALUES(NULL, %s, %s, %s, %s, %s, %s)'
        self.cursor.execute(sql, (item['name'], item['salary'], item['fuli'],
                                  item['address'], item['jingyan'], item['company']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()  # close the cursor
        self.conn.close()    # close the connection
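
The INSERT above assumes a table roughly like the following; this schema is a guess (column types are illustrative), with an auto-increment id consumed by the leading NULL:

CREATE TABLE zl (
    id INT PRIMARY KEY AUTO_INCREMENT,  -- filled in by the leading NULL
    name VARCHAR(255),
    salary VARCHAR(64),
    fuli VARCHAR(255),
    address VARCHAR(255),
    jingyan VARCHAR(64),
    company VARCHAR(255)
) DEFAULT CHARSET=utf8;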


Reposted from blog.csdn.net/qq_42817166/article/details/83314118