爬取百度图片

因为百度图片是瀑布流 Ajax 异步加载的数据,所以这里用到抓包工具(Fiddler)来抓取链接

好了直接上代码,

 1 from selenium import webdriver
 2 from selenium.webdriver.common.by import By
 3 import requests,time
 4 from queue import Queue
 5 from urllib import request
 6 import os,gevent
 7 from lxml import etree
 8 
 9 
10 
11 
def get_img(html):
    """Worker: take one page source from the queue and download its images.

    Args:
        html: a queue.Queue holding page-source strings (one per scroll pass).
              The parameter name is kept for interface compatibility even
              though it receives the queue, not the HTML itself.

    Side effects:
        Creates ./baidupic/ if missing and writes one file per image URL.
    """
    page = html.get()  # blocks until a captured page is available

    tree = etree.HTML(page)
    # Each <li> carries the full-size image URL in its data-objurl attribute;
    # div[last()] restricts the match to the most recently appended batch.
    img_urls = tree.xpath('//div[@id="imgid"]/div[last()]//li/@data-objurl')

    path = './baidupic/'
    # exist_ok avoids the check-then-create race of exists()+makedirs().
    os.makedirs(path, exist_ok=True)

    for url in img_urls:
        print(url)
        try:
            fname = url.split('/')[-1]
            request.urlretrieve(url, os.path.join(path, fname))
            print('下载成功')
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit); downloads stay best-effort.
            print('图片不存在')
33 
34 
def get_page(driver_path=r'C:\Users\zhaozhi\Desktop\chromedriver.exe',
             keyword='美女', scrolls=10):
    """Search Baidu Images, scroll to trigger lazy loading, and dispatch
    each captured page source to a get_img worker.

    Args:
        driver_path: path to the chromedriver executable (default keeps the
            original hard-coded location for backward compatibility).
        keyword: search term typed into the Baidu Images search box.
        scrolls: number of scroll-to-bottom passes; one page source is
            queued per pass.
    """
    # Queue carrying one page-source snapshot per scroll pass.
    q = Queue()

    # Baidu image search landing page.
    base_url = 'https://image.baidu.com/'
    browser = webdriver.Chrome(executable_path=driver_path)
    browser.get(base_url)

    # Selenium 4 removed the find_element_by_* helpers; use the By locator
    # API (already imported at the top of the file).
    browser.find_element(By.ID, 'kw').send_keys(keyword)
    browser.find_element(By.CLASS_NAME, 's_search').click()
    time.sleep(2)  # let the first result page render before scrolling

    for _ in range(scrolls):
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
        # Without this pause the AJAX waterfall has no time to append new
        # images, so every snapshot would be identical.
        time.sleep(2)
        q.put(browser.page_source)

    browser.quit()  # release the browser/driver once all pages are captured

    # Spawn exactly one worker per queued page. The original spawned 20
    # workers for 10 pages, so 10 of them blocked forever in q.get() and
    # gevent.joinall() never returned.
    workers = [gevent.spawn(get_img, q) for _ in range(scrolls)]
    gevent.joinall(workers)
65 
66 
67 
68 
69 
70 
71 
72 
73 # browser.save_screenshot('baidupic.png')
74 # print(browser.page_source)
75 # browser.find_element(By_)
76 
# Entry point: run the scraper only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
    get_page()

猜你喜欢

转载自www.cnblogs.com/lyxdw/p/9231515.html
今日推荐