启动文件main.py
# Entry point: launch the 'tuchong' spider exactly as if
# `scrapy crawl tuchong` had been typed on the command line.
from scrapy.cmdline import execute

# Guarding with __main__ keeps the crawl from firing if this module is
# ever imported instead of executed directly.
if __name__ == '__main__':
    execute('scrapy crawl tuchong'.split())
spiders 目录下的爬虫文件 tuchong.py
import scrapy,json
from ..items import tu_baoc
class TuchongSpider(scrapy.Spider):
    """Crawl pages 1-3 of a Tuchong user's post feed (JSON API) and
    yield one item per image, carrying the post title and image URL."""

    name = 'tuchong'
    # Must match the host actually requested. The original listed
    # 'stock.tuchong.com' while every request goes to tuchong.com,
    # which lets the offsite middleware filter follow-up requests.
    allowed_domains = ['tuchong.com']

    def start_requests(self):
        # Pages 1-3 of the feed, 20 posts per page.
        for page in range(1, 4):
            url = ('https://tuchong.com/rest/2/sites/2634795/posts'
                   '?count=20&page=' + str(page) + '&before_timestamp=0')
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Parse one JSON feed page into one item per image."""
        data = json.loads(response.text)
        for post in data['post_list']:
            title = post['title']
            for image in post['images']:
                # Create a FRESH item per image. The original reused a
                # single item object and mutated it on every yield, so
                # any consumer that buffers items would see only the
                # last image's values in every reference.
                item = tu_baoc()
                item['name'] = title
                item['img_url'] = 'http://photo.tuchong.com/2634795/f/{}.jpg'.format(image['img_id'])
                yield item
items.py
# 自定义Item类, 声明本爬虫要抓取的字段
class tu_baoc(scrapy.Item):
    """Item describing one picture scraped from Tuchong."""

    # Post title; used downstream as the base of the saved file name.
    name = scrapy.Field()
    # Direct URL of the full-size image to download.
    img_url = scrapy.Field()
配置文件settings.py
# Pull in a personal helper module.
# NOTE(review): star-import of an unknown relative module — everything it
# defines lands in this settings namespace; verify it does not shadow any
# Scrapy setting defined below.
from ..piaot import *
# Do not honor robots.txt, so the API endpoints are not blocked.
ROBOTSTXT_OBEY = False
# Default request headers: a desktop-browser User-Agent so the site does
# not serve a bot-specific response.
DEFAULT_REQUEST_HEADERS = {
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}
# Enable the image-saving pipeline; 300 is its priority (lower runs first).
ITEM_PIPELINES = {
'tu_chong.pipelines.img_URL_baoc': 300,
}
存储文件pipelines.py
import os
import time

import requests
class TuChongPipeline(object):
    """Default no-op pipeline generated by Scrapy's project template."""

    def process_item(self, item, spider):
        # Nothing to do here; returning the item keeps the pipeline
        # chain intact for any pipeline that runs after this one.
        return item
class img_URL_baoc(object):
    """Download each item's image ('img_url') and save it to disk,
    naming the file after the item's 'name' plus a timestamp."""

    # Output directory; same value as the original hard-coded path, but
    # hoisted to a class attribute so it can be overridden without
    # editing the method.
    save_dir = "C:/Users/黑神/Desktop/pc_zy/图虫/"

    def process_item(self, item, spider):
        # A timeout keeps a dead server from hanging the whole crawl;
        # raise_for_status surfaces HTTP errors instead of silently
        # writing an error page as a .jpg.
        resp = requests.get(url=item['img_url'], timeout=30)
        resp.raise_for_status()
        # Second-granularity timestamp suffix reduces (but does not
        # eliminate) collisions between posts with the same title.
        stamp = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))
        # Create the target directory if it is missing; the original
        # crashed with FileNotFoundError when it did not exist.
        os.makedirs(self.save_dir, exist_ok=True)
        path = self.save_dir + item['name'] + stamp + '.jpg'
        with open(path, "wb") as f:
            f.write(resp.content)
        # Scrapy contract: process_item must return the item (or raise
        # DropItem). The original returned None, silently dropping the
        # item for every pipeline running after this one.
        return item