I wrote another scraper for downloading 小姐姐 (girl) photo galleries, slightly more complex than the last one.

# Import modules
import os
import re

import requests
from bs4 import BeautifulSoup

# Create the output folder
path = os.getcwd()
new_path = os.path.join(path, '小姐姐')
if not os.path.isdir(new_path):
    os.mkdir(new_path)

# Main scraper

# Request headers: some sites refuse requests that don't carry a browser-like
# User-Agent (search around for the details if you're curious)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'}
response = requests.get('http://www.mmonly.cc/tag/ltmn/', headers=headers)

# Decode manually: response.content is a raw byte stream, and this site uses
# a GBK-family encoding
con = response.content.decode('gb18030')
soup = BeautifulSoup(con, 'lxml')  # parse the page with the lxml parser

# Find every tag with class ABox (one per gallery)
my_girl = soup.select('.ABox')

# Iterate over the galleries to get each gallery's URL
for girl in my_girl:
    link = girl.find('a').get('href')
    response = requests.get(link, headers=headers)
    con1 = response.content.decode('gb18030')
    soup1 = BeautifulSoup(con1, 'lxml')

    # Find all <a> tags under the element with class pages
    pages = soup1.select('.pages ul li a')
    # The first <a> tag's text holds the page count, e.g. "共10页"
    s = pages[0].get_text()
    # Strip the non-digits to get the total number of images
    # (one image per page on this site, so this is also the number of pages)
    num = int(re.sub(r'\D', '', s))

    # Iterate over the pages to get each image's page URL
    for page in range(1, num + 1):
        if page == 1:
            flink1 = link
        else:
            flink1 = link.replace('.html', '_%s.html' % page)
        response = requests.get(flink1, headers=headers)
        con2 = response.content.decode('gb18030')
        soup2 = BeautifulSoup(con2, 'lxml')
        my_girl2 = soup2.select('#big-pic img')

        # Find the image itself and download it
        for pic in my_girl2:
            pic_link = pic.get('src')
            response = requests.get(pic_link, headers=headers)
            con3 = response.content
            # Write the image into the '小姐姐' folder
            with open(os.path.join(new_path, pic_link[-11:]), 'wb') as f:
                f.write(con3)

# Final reminder: overdo it and you'll go up in smoke
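
Why decode response.content by hand instead of just using response.text? requests guesses the encoding from the HTTP headers, and for GBK-family pages like this one the guess is often wrong, which garbles the Chinese text. A minimal illustration of the two options, using the same category URL as above (the trimmed User-Agent is just for brevity here):

import requests

r = requests.get('http://www.mmonly.cc/tag/ltmn/', headers={'User-Agent': 'Mozilla/5.0'})
print(r.encoding)           # encoding requests guessed from the response headers
print(r.apparent_encoding)  # encoding guessed from the bytes themselves
# Option 1: decode the raw bytes manually (what the script above does)
html = r.content.decode('gb18030')
# Option 2: tell requests the right encoding, then use r.text
r.encoding = 'gb18030'
html = r.text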

The one I wrote before only scraped the images inside a single gallery; this one scrapes every image in every gallery under a whole category (I can't take it, I can't take it!). The main modules used are requests, os, re, and BeautifulSoup.
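If I were to harden this (my own tweak, not part of the original script): give every request a timeout, skip failures instead of crashing mid-category, and take the filename from the URL with os.path.basename rather than slicing the last 11 characters, which breaks silently on longer names. A sketch, where fetch and save_image are hypothetical helper names:

import os
import requests

def fetch(url, headers, timeout=10):
    # Thin wrapper: returns the response, or None on any network error
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        return r
    except requests.RequestException as e:
        print('skipping %s: %s' % (url, e))
        return None

def save_image(pic_link, folder, headers):
    r = fetch(pic_link, headers)
    if r is None:
        return
    # basename('http://.../abc.jpg') -> 'abc.jpg', safer than pic_link[-11:]
    name = os.path.basename(pic_link)
    with open(os.path.join(folder, name), 'wb') as f:
        f.write(r.content)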

Reposted from www.cnblogs.com/hcq-learning/p/9076997.html