普通 scrapy 文件:
scrapy startproject 项目名称
scrapy genspider 文件名 域名
#如:scrapy genspider baidu baidu.com
#会生成一个baidu.py文件
scrapy crawl 名字
#运行文件
另一种方式:
scrapy startproject 项目名称
scrapy genspider -t crawl 文件名 域名
#如:scrapy genspider -t crawl baidu baidu.com
#会生成一个baidu.py文件
scrapy crawl 名字
#运行文件
用浏览器爬取时的setting文件的配置:
DOWNLOADER_MIDDLEWARES = {
'Zhilian.middlewares.ZhilianDownloaderMiddleware': 543,
# Custom downloader middleware: every download passes through it, so it can
# intercept requests (here: to render pages with selenium instead).
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware':None,
# None disables Scrapy's built-in User-Agent middleware so the custom
# middleware fully controls how requests are made.
}
middlewares.py 文件的配置
def process_request(self, request, spider):
    """Fetch *request.url* with headless Chrome instead of Scrapy's downloader.

    Called by Scrapy for every request that passes through this downloader
    middleware. Returning an HtmlResponse short-circuits the download chain:
    the response is handed straight to the spider, so JS-rendered content is
    available. (Other valid returns: None to continue processing, a Request
    to reschedule, or raise IgnoreRequest.)

    :param request: the scrapy Request being downloaded
    :param spider: the spider that issued the request
    :return: scrapy.http.HtmlResponse built from the browser-rendered page
    """
    print("下载正在进行。。。")
    opt = webdriver.ChromeOptions()
    opt.add_argument("--headless")
    driver = webdriver.Chrome(options=opt)
    try:
        driver.get(request.url)
        # Crude wait for JS to finish; a WebDriverWait on a concrete element
        # would be more reliable than a fixed sleep.
        sleep(1)
        body = driver.page_source
        current_url = driver.current_url
    finally:
        # BUG FIX: the original never quit the driver, leaking one Chrome
        # process per downloaded request.
        driver.quit()
    # Build a response from the browser-rendered source so the spider parses
    # the post-JS DOM.
    return HtmlResponse(url=current_url, body=body, encoding='utf-8', request=request)
补充:存入数据库或者csv文件或者json文件
import csv
class InterstingPipeline(object):
    """Buffer scraped items in memory and dump them to u148.csv on close."""

    def open_spider(self, spider):
        # newline='' is required by the csv module so it controls line
        # endings itself (otherwise blank rows appear on Windows).
        self.csv_file = open("u148.csv", 'w', encoding='utf-8', newline='')
        self.csvItems = []

    def process_item(self, item, spider):
        """Called once per scraped item; buffer one CSV row per item."""
        self.csvItems.append([
            item["author"],
            item["title"],
            item["img"],
            item["abstract"],
            item["time"],
        ])
        return item

    def close_spider(self, spider):
        """Write the header plus all buffered rows and close the file."""
        writer = csv.writer(self.csv_file)
        writer.writerow(["author", "title", "img", "abstract", "time"])
        # BUG FIX: the original called writerow() on the list of rows, which
        # wrote the entire dataset as one malformed row; writerows() writes
        # one CSV row per buffered item.
        writer.writerows(self.csvItems)
        self.csv_file.close()
def open_spider(self, spider):
    """Prepare JSON export: reset the item buffer and open the output file."""
    # Items are buffered in memory and dumped in one go by close_spider.
    self.items = []
    self.zhilian_json = open('zhilian.json', 'w', encoding='utf-8')
def process_item(self, item, spider):
    """Buffer a plain-dict snapshot of *item*, then pass it down the chain."""
    snapshot = dict(item)
    self.items.append(snapshot)
    # Pipelines must return the item so later pipelines still receive it.
    return item
def close_spider(self, spider):
    """Serialize every buffered item as one JSON array and close the file.

    ensure_ascii=False keeps non-ASCII (e.g. Chinese) text readable in the
    output instead of \\uXXXX escapes — safe here because open_spider opens
    the file with encoding='utf-8'.
    """
    self.zhilian_json.write(json.dumps(self.items, ensure_ascii=False))
    self.zhilian_json.close()
# def open_spider(self,apider):
# self.conn =pymysql.connect(host = '127.0.0.1',port=3306,db='zhilian',user='cy',password = '123456',charset = 'utf8')#与数据库之间创建链接(注意:port 必须是整数,不能是字符串)
# self.cursor = self.conn.cursor()#创建游标
# def process_item(self, item, spider):
# sql = 'INSERT INTO zl VALUES(NULL ,"%s","%s","%s","%s","%s","%s")'%(item['name'],item['salary'],item['fuli'],item['address'],item['jingyan'],item['company'])#插入数据库的语句(原来 "%s", 后多了一个逗号,是SQL语法错误;另:应改用参数化查询防止SQL注入)
# self.cursor.execute(sql)
# self.conn.commit()
# return item
# def close_spider(self,spider):
# self.cursor.close()#关闭游标
# self.conn.close()#关闭数据库