# Crawl photo URLs from qiushibaike (first 5 pages) — regex (re.findall) version.
import os
import re                   # regex extraction of <img src=...> from raw HTML
import requests             # HTTP GET for the listing pages
from urllib import request  # urlretrieve is convenient for downloading images

# Spoofed User-Agent so the site does not reject the scripted request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}

# Make sure the output directory exists, otherwise urlretrieve raises
# FileNotFoundError on the first download.  (Keeps the original "imges" spelling
# so file locations stay compatible with the other variants in this file.)
os.makedirs("./imges", exist_ok=True)

k = 0  # global running counter across all downloaded images
for i in range(1, 6):
    url = f'https://www.qiushibaike.com/imgrank/page/{i}/'
    # timeout prevents the script from hanging forever on a stalled connection
    res = requests.get(url, headers=headers, timeout=10)
    # re.S makes '.' also match newlines, so the pattern spans multi-line HTML.
    img_urls = re.findall('<div class="thumb">.*?<img src="(.*?)".*? height="auto">.*?</div>', res.text, re.S)
    for img_url in img_urls:
        k += 1
        # The page uses protocol-relative URLs ("//..."); prepend the scheme.
        img_url = 'https:' + img_url
        imgName = "./imges/qiushi" + str(i) + str(k) + ".jpg"
        request.urlretrieve(img_url, imgName)
## bs4 (BeautifulSoup) documentation: https://beautifulsoup.readthedocs.io/zh_CN/v4.4.0/
# Crawl photo URLs from qiushibaike (first 5 pages) — BeautifulSoup (bs4) version.
import os
import requests                 # HTTP GET for the listing pages
from bs4 import BeautifulSoup   # HTML parsing
from urllib import request      # urlretrieve is convenient for downloading images

# Spoofed User-Agent so the site does not reject the scripted request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}

# Ensure the output directory exists before the first urlretrieve call.
os.makedirs("./imges", exist_ok=True)

k = 0  # global running counter across all downloaded images
for i in range(1, 6):
    # BUG FIX: the URL was hardcoded to page/1/, so the loop fetched page 1
    # five times instead of pages 1-5.  Interpolate the page number instead.
    url = f'https://www.qiushibaike.com/imgrank/page/{i}/'
    res = requests.get(url, headers=headers, timeout=10)
    # Parse the response body with the lxml backend.
    soup = BeautifulSoup(res.text, "lxml")
    # Image tags on this site carry class="illustration".
    img_tags = soup.find_all(class_="illustration")
    for tag in img_tags:
        k += 1
        # The page uses protocol-relative URLs ("//..."); prepend the scheme.
        img_url = "https:" + tag.get("src")
        imgName = "./imges/qiushi" + str(i) + str(k) + ".jpg"
        request.urlretrieve(img_url, imgName)
## lxml (XPath) version
# Crawl photo URLs from qiushibaike (first 5 pages) — lxml / XPath version.
import os
import requests             # HTTP GET for the listing pages
from lxml import etree      # HTML parsing + XPath extraction
from urllib import request  # urlretrieve is convenient for downloading images

# Spoofed User-Agent so the site does not reject the scripted request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"
}

# Ensure the output directory exists before the first urlretrieve call.
os.makedirs("./imges", exist_ok=True)

k = 0  # global running counter across all downloaded images
for i in range(1, 6):
    # BUG FIX: the URL was hardcoded to page/1/, so the loop fetched page 1
    # five times instead of pages 1-5.  Interpolate the page number instead.
    url = f'https://www.qiushibaike.com/imgrank/page/{i}/'
    res = requests.get(url, headers=headers, timeout=10)
    # Parse the HTML string into an element tree for XPath queries.
    html = etree.HTML(res.text)
    # Select the src attribute of every <img> inside a <div class="thumb">.
    img_urls = html.xpath("//div[@class='thumb']//img/@src")
    for img_url in img_urls:
        k += 1
        # The page uses protocol-relative URLs ("//..."); prepend the scheme.
        img_url = "https:" + img_url
        imgName = "./imges/qiushi" + str(i) + str(k) + ".jpg"
        request.urlretrieve(img_url, imgName)
        print("正在下载ing:%s" % img_url)