Preface
Overall approach
Aim
Crawl all the images under the Douban gallery topic [Which very philosophical comics have you seen]
Process overview
Try it with plain Selenium first, then crawl the same pages with Scrapy.
Observe the page
Scroll down and more images keep appearing — the page loads them lazily as you scroll.
Plain Selenium method
Idea:
Use Selenium with saved cookies to log in to the page, then locate the images. Saving the cookies is a one-time operation (see the commented-out block in the script, and the sketch after it). The code is as follows:
# A raw Cookie header captured from a logged-in Douban session, kept for reference:
# Cookie: ll="108297"; bid=Js9xsAq24wE; __yadk_uid=Dl3d4S34ZIAcgpdKiBb7MzSGIeRgFKar; _vwo_uuid_v2=DC3BB855EB5062400749F27AB6BE5CC06|840943523ee629a704e0ce81eb53bca0; __utmz=30149280.1613021276.4.4.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; push_doumail_num=0; __utmv=30149280.20597; douban-profile-remind=1; push_noty_num=0; dbcl2="205973871:7MTt7TSBboU"; ck=oCQO; __utmc=30149280; __utmc=223695111; ap_v=0,6.0; __utma=30149280.320620993.1611971043.1613287832.1613436467.9; __utmt=1; __gads=ID=52803da84dcc685f:T=1613094871:S=ALNI_MZnJgc5Huj3WCQ9GyFukTzESVbz9w; __utmb=30149280.4.10.1613436467; __utma=223695111.1081902582.1611971045.1613289559.1613436530.8; __utmb=223695111.0.10.1613436530; __utmz=223695111.1613436530.8.8.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/gallery/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1613436530%2C%22https%3A%2F%2Fwww.douban.com%2Fgallery%2F%22%5D; _pk_id.100001.4cf6=87abb0aebfe2f1a5.1611971046.7.1613436530.1613289678.; _pk_ses.100001.4cf6=*
import requests
import time
import json
from selenium import webdriver

browser = webdriver.Chrome()

# Leftover from an earlier Baidu image search test (unused below):
# url = "https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%B9%FE%CA%BF%C6%E6&fr=ala&ala=1&alatpl=adress&pos=0&hs=2&xthttps=000000"

headers = {  # mimic a browser identity when fetching the image files
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36',
}

browser.get('https://accounts.douban.com/passport/login?source=movie')
time.sleep(2)
# # Get the cookies and turn the dict into a str with the json module
# dictCookies = browser.get_cookies()
# jsonCookies = json.dumps(dictCookies)
# # After logging in, save the cookies to a local file
# with open('cookies.json', 'w') as f:
#     f.write(jsonCookies)

# A connection must be established once before the cookies can be modified
browser.get('https://accounts.douban.com/passport/login?source=movie')
# Delete the cookies created by that first connection
browser.delete_all_cookies()
# Read back the cookies that were saved locally at login time
with open('cookies.json', 'r', encoding='utf-8') as f:
    listCookies = json.loads(f.read())
for cookie in listCookies:
    browser.add_cookie({
        'domain': '.accounts.douban.com',  # note the leading dot before the domain
        'name': cookie['name'],
        'value': cookie['value'],
        'path': '/',
        'expires': None
    })
# Visit the page again -- this time no login is needed
browser.get('https://www.douban.com/gallery/topic/75561/')

# Scroll down in small steps so the lazily loaded images appear
for y in range(30):
    browser.execute_script('window.scrollBy(0,100)')
    time.sleep(0.5)
# browser.execute_script('document.documentElement.scrollTop=10000')
# (10000 would jump straight to the bottom)
# browser.find_element_by_xpath("/html/body/div[3]/div[1]/div/div[1]/div[1]/div[2]/ul/li[40]/a").click()
# print(content.text)

# Selenium 3 API; in Selenium 4+ use browser.find_elements(By.XPATH, ...)
url_list = browser.find_elements_by_xpath("//li//span//img")
for i, url in enumerate(url_list):
    print(url.get_attribute("src"))
    content = requests.get(url.get_attribute("src"), headers=headers).content
    with open("D:/base/zheli/%s.jpg" % i, "wb") as fp:
        fp.write(content)
    print("%s downloaded!" % i)
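The commented-out block near the top is the one-time save step. Run once beforehand as a standalone script, it produces the cookies.json that the main script reads back. A minimal sketch, assuming you complete the login by hand in the opened window:

import json
import time
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('https://accounts.douban.com/passport/login?source=movie')
time.sleep(30)  # log in manually in the browser window within 30 seconds

# get_cookies() returns a list of dicts; dump it to disk as JSON
with open('cookies.json', 'w') as f:
    json.dump(browser.get_cookies(), f)
browser.quit()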
Result: the images land in D:/base/zheli/ (result screenshot omitted).
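One caveat: 30 scrolls of 100 px only cover the first few screens, so not every image under the topic gets loaded. A hedged alternative is to keep scrolling until the document height stops growing, e.g.:

# Keep scrolling until the page height stops growing, i.e. the lazy
# loader is no longer appending new images
last_height = browser.execute_script("return document.body.scrollHeight")
while True:
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)  # give the lazy loader time to insert new <img> tags
    new_height = browser.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break  # bottom reached
    last_height = new_height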
Scrapy method
The spider code is as follows:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from ..items import ImagedownloadItem


class ZcoolSpider(CrawlSpider):
    name = 'zcool'
    allowed_domains = ['zcool.com.cn', 'douban.com']
    # start_urls = ['http://zcool.com.cn/']
    start_urls = ['https://www.douban.com/gallery/topic/75561/']

    rules = (
        # pagination URLs (note: this allow pattern is a single literal image URL)
        Rule(LinkExtractor(allow=r"https://img3.doubanio.com/view/status/l/public/110897263-2e427f8ee2e927e.jpg"),
             follow=True, callback="parse_detail"),
        # detail-page URLs
        # Rule(LinkExtractor(allow=r".+/work/.+html"), follow=False, callback="parse_detail")
    )

    def parse_detail(self, response):
        image_urls = response.xpath("//img/@src").getall()
        print(image_urls)
        # title_list = response.xpath("//div[@class='details-contitle-box']/h2/text()").getall()
        # title = "".join(title_list).strip()
        item = ImagedownloadItem(image_urls=image_urls)
        yield item
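For the yielded item to trigger any downloads, ImagedownloadItem and the project settings have to be wired to Scrapy's built-in ImagesPipeline. The original items.py and settings.py are not shown, so the following is a sketch of the assumed wiring:

# items.py -- field names follow the ImagesPipeline defaults
import scrapy

class ImagedownloadItem(scrapy.Item):
    image_urls = scrapy.Field()  # the pipeline downloads every URL in this list
    images = scrapy.Field()      # filled in by the pipeline with download results

# settings.py -- enable the built-in pipeline (requires Pillow) and pick a folder
# ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
# IMAGES_STORE = 'D:/base/zheli'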
I don't know why the rule for the specified target never matches, while a whole-site image crawl works. A likely reason: LinkExtractor only pulls URLs out of <a href> links, and the image addresses appear only in <img src> attributes, so an allow pattern consisting of a literal image URL has nothing to match. In any case, with the headers set this time, I did not run into the cookie-related access problems from before.
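For reference, "setting the header" in a Scrapy project usually happens in settings.py. A sketch, reusing the User-Agent from the Selenium script above (the Referer value is an assumption):

# settings.py -- send a browser-like identity with every request
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36')
DEFAULT_REQUEST_HEADERS = {
    'Referer': 'https://www.douban.com/gallery/',  # assumption: a plausible referer
}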
To be continued