I. Install the required software
1. docker (on Ubuntu)
2. scrapy (pip install)
3. splash
4. scrapy-splash (pip install)
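The pip-installable parts can be set up as follows (Docker itself and the Splash image are covered in the next section):

cmd >> pip install scrapy
cmd >> pip install scrapy-splash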
II. Installing Splash
Overview: Splash can execute custom rendering scripts written in Lua, which lets us use it as a browser-automation tool, much like PhantomJS.

docker pull scrapinghub/splash                # pull the image
docker run -p 8050:8050 scrapinghub/splash    # run it

Then open http://localhost:8050 in a browser to check it is up.
Docs: https://splash-cn-doc.readthedocs.io/zh_CN/latest/index.html
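Once the container is running, a quick sanity check from Python helps before wiring Splash into Scrapy. This is a minimal sketch using the requests library against Splash's render.html endpoint; the target URL and wait value are arbitrary examples:

import requests

# Ask Splash to render a page and return the final HTML after JS has run.
# Assumes Splash is listening on localhost:8050 (see `docker run` above).
resp = requests.get(
    "http://localhost:8050/render.html",
    params={"url": "https://example.com", "wait": 1},
    timeout=30,
)
print(resp.status_code)   # 200 if Splash rendered the page
print(resp.text[:200])    # first 200 characters of the rendered HTML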
III. Code: crawling 2020 epidemic (novel coronavirus) data from https://voice.baidu.com/act/newpneumonia/newpneumonia. The page only reveals the detailed figures after a div is clicked, which is why Splash is needed.
cmd >> scrapy startproject douban                         ## scrapy startproject project_name
cmd >> cd douban/douban/spiders
cmd >> scrapy genspider douban_spider movie.douban.com    ## scrapy genspider spider_name start_url

(These commands are a generic example; for this project the equivalents would be scrapy startproject yq and scrapy genspider yqspider voice.baidu.com.)
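For reference, startproject generates the standard Scrapy layout; for a project named yq (which the code below assumes) it would look like:

yq/
    scrapy.cfg
    yq/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            yqspider.py    # created by genspider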
settings.py:

SPLASH_URL = 'http://172.17.0.1:8050'  # your Docker host address; check it with `ifconfig` in a terminal

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

items.py:

import scrapy


class YqItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()  # city name
    sf = scrapy.Field()    # province name
    d1 = scrapy.Field()    # first data column of the row
    d2 = scrapy.Field()    # second data column
    d3 = scrapy.Field()    # third data column

spiders/yqspider.py:

# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest

from yq.items import YqItem


def script(n):
    # Lua script run inside Splash: load the page, click the n-th
    # province cell so its city-level rows render, then return the HTML.
    _script = """
    function main(splash, args)
        splash.images_enabled = false
        assert(splash:go(args.url))
        assert(splash:wait(1))
        splash:runjs('document.querySelectorAll("td.VirusTable_1-1-83_MdE8uT")[{}].click()')
        assert(splash:wait(1))
        return splash:html()
    end
    """.format(n)
    return _script


class YqspiderSpider(scrapy.Spider):
    name = 'yqspider'
    allowed_domains = ['voice.baidu.com']
    start_urls = ['https://voice.baidu.com/act/newpneumonia/newpneumonia']

    def parse(self, response):
        # Issue one Splash request per province cell; each request runs the
        # Lua script with a different index so a different cell is clicked.
        for url in self.start_urls:
            for i in range(34):  # 34 province-level cells in the table
                yield SplashRequest(url=url,
                                    callback=self.s_parse,
                                    meta={'index': i},
                                    endpoint='execute',
                                    args={'wait': 10, 'images': 0, 'lua_source': script(i)})

    def s_parse(self, response):
        index = response.meta['index']
        # the text of the clicked cell is the province name
        sf = response.xpath("//td[@class='VirusTable_1-1-83_MdE8uT']//text()")[index].extract()
        fr = response.xpath("//td//text()")
        lens = len(fr)
        # The table cells come back as one flat list; the offsets below skip
        # header and footer cells (specific to this page's layout), reading
        # four cells per row: city name plus three data columns.
        for i in range(16, lens - 74, 4):
            print(sf, fr[i].extract(), fr[i + 1].extract(), fr[i + 2].extract(), fr[i + 3].extract())
            it = YqItem()  # create a fresh item for each row
            it['sf'] = sf
            it['name'] = fr[i].extract()
            it['d1'] = fr[i + 1].extract()
            it['d2'] = fr[i + 2].extract()
            it['d3'] = fr[i + 3].extract()
            yield it

Data cleanup script (reads the crawl output A.csv, groups rows by province, and appends them to yq_C.csv):

import csv

import pandas as pd

# province-level names
dict1 = ["北京", "天津", "河北", "山西", "内蒙古", "辽宁", "吉林", "黑龙江", "上海", "江苏",
         "浙江", "安徽", "福建", "江西", "山东", "河南", "湖北", "湖南", "广东", "广西",
         "海南", "重庆", "四川", "贵州", "云南", "西藏", "陕西", "甘肃", "青海", "宁夏",
         "新疆", "香港", "澳门", "台湾"]
# city-level names that belong to 湖北 (Hubei)
dict2 = ["武汉", "黄冈", "孝感", "襄阳", "随州", "宜昌", "荆州", "荆门", "鄂州", "黄石",
         "咸宁", "十堰", "仙桃", "恩施", "天门", "潜江", "神农架地区", "待确认"]

am = {}
print("Start.....")

df = pd.read_csv("A.csv", encoding='gb18030')
dflen = len(df)
for i in range(dflen):
    sf = str(df.iloc[i]["sf"])
    name = str(df.iloc[i]["name"])
    data1 = str(df.iloc[i]["d1"])
    data2 = str(df.iloc[i]["d2"])
    data3 = str(df.iloc[i]["d3"])
    if sf in dict1:
        # rows whose "city" is actually a Hubei city are grouped under 湖北
        if name in dict2 and name not in dict1:
            if "湖北" not in am:
                am["湖北"] = [["湖北", name, data1, data2, data3]]
            else:
                am["湖北"].append(["湖北", name, data1, data2, data3])
        # other city rows are grouped under their own province;
        # province-level summary rows (name in dict1) are skipped
        if name not in dict2 and name not in dict1:
            if sf not in am:
                am[sf] = [[sf, name, data1, data2, data3]]
            else:
                am[sf].append([sf, name, data1, data2, data3])

for i in am:
    print(str(i) + "->" + str(am[i]))

with open('yq_C.csv', 'a', newline='') as awrite:  # `with` ensures the file is closed
    csv_write = csv.writer(awrite, dialect='excel')
    for i in am:
        for j in am[i]:
            csv_write.writerow(j)
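The cleanup script reads A.csv, which is most naturally produced by running the spider with Scrapy's built-in CSV feed export (the file name is whatever you pass to -o):

cmd >> scrapy crawl yqspider -o A.csv

Note that Scrapy exports feeds as UTF-8 by default, while the script above reads with encoding='gb18030'; unless the file was re-saved in a GBK-family encoding (e.g. via Excel), adjust the read_csv encoding to match.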
Scrapy crawler with scripted client-side actions (clicks, scrolling down, etc.): notes on scrapy and splash operations (kept for my own records).
Reposted from blog.csdn.net/qq_36336522/article/details/104150411