# 版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
# 最近在和一个朋友研究爬虫,我和他每人负责一个网站的图片爬取,以下是我负责的网站爬取的code,他的code的文章链接是
# https://blog.csdn.net/qq_39305249/article/details/102628783
# 我的多线程爬取cosplay图片的链接是
# https://blog.csdn.net/qq_45026221
import requests
from bs4 import BeautifulSoup
import time
import os
def get_main_urls(headers, pages=233, outfile='r.txt'):
    """Collect album URLs from the site's listing pages and append them to *outfile*.

    Each listing page at https://www.mzitu.com/page/<n> contains a
    <ul class="postlist"> of <li> items whose first <a> links to an album.
    URLs are written comma-separated, matching the split(',') in the main script.

    headers: request headers (User-Agent + Referer; the site blocks bare requests).
    pages:   number of listing pages to walk (233 at time of writing).
    outfile: path of the text file the URLs are appended to.
    """
    # Open the output file once instead of re-opening it for every URL.
    with open(outfile, 'a') as f:
        for page in range(1, pages + 1):
            # Original code concatenated a double slash ('.com/' + '/page/');
            # build the URL with a single one.
            res = requests.get('https://www.mzitu.com/page/' + str(page),
                               headers=headers)
            soup = BeautifulSoup(res.text, 'lxml')
            # `items` instead of the original `list`, which shadowed the builtin.
            items = soup.find(class_='postlist').find_all('li')
            for item in items:
                f.write(item.find('a').get('href') + ',')
def get_pics_urls(url, headers):
    """Download every image of the album at *url* into images/<title>/.

    The album page exposes its page count in the second-to-last 'pagenavi'
    link and its title in the 'main-title' element. Each sub-page url/<i>
    carries one <img>; it is fetched with the same headers (the Referer is
    needed so the image host accepts the request) and saved as <i>.jpg.
    """
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, 'lxml')
    # Number of pages in this album, read from the pagination widget.
    total = int(soup.find(class_='pagenavi').find_all('a')[-2].find('span').string)
    title = soup.find(class_='main-title').string
    folder = 'images/' + title + '/'
    # exist_ok replaces the original `if os.path.exists(...) == False` check.
    os.makedirs(folder, exist_ok=True)
    # The loop variable doubles as the 1-based image index (the original kept
    # a separate, manually incremented `index` that always equalled it).
    for index in range(1, total + 1):
        page_res = requests.get(url + '/' + str(index), headers=headers)
        page_soup = BeautifulSoup(page_res.text, 'lxml')
        pic_url = page_soup.find('img').get('src')
        print('downloading......' + title + 'NO.' + str(index))
        img = requests.get(pic_url, headers=headers).content
        with open(folder + str(index) + '.jpg', 'wb') as f:
            f.write(img)
    print('当前图集下载完成')
if __name__ == '__main__':
    # Referer is required alongside the User-Agent: the site rejects
    # image requests that arrive without it.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) App'
                             'leWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.'
                             '3865.120 Safari/537.36',
               'Referer': 'https://www.mzitu.com/'
               }
    print("程序于 {} 开始启动,请等待...".format(time.ctime()))
    # Uncomment to (re)build the URL list; normally the cached r.txt is reused.
    # get_main_urls(headers)
    with open('r.txt', 'r') as f:
        # The file ends with a trailing ',', so split() produces a final empty
        # string — drop empties so get_pics_urls never receives a blank URL.
        urls = [u for u in f.read().split(',') if u]
    total = len(urls)
    for i, url in enumerate(urls, start=1):
        # Report progress against the real count instead of a hard-coded 5574.
        print('正在下载第' + str(i) + '个图集,共' + str(total) + '个图集')
        get_pics_urls(url, headers)