# 版权声明:本文为博主原创文章,未经博主允许也可以转载,注明转载即可。 https://blog.csdn.net/xiligey1/article/details/84866969
"""TODO: 只抓热门精华可以提高图片质量"""
import re
import requests
from bs4 import BeautifulSoup
def get_page_urls(page_url):
    """Return the links (hrefs) of all posts on the given list page.

    Fetches *page_url*, decodes the GBK payload and pulls the ``href`` of
    each post-title cell out of the forum's listing table.

    :param page_url: URL of one paginated thread-list page.
    :return: list of relative post URLs (strings).
    """
    # errors='replace' keeps one malformed byte (frequent on GBK forum
    # pages) from raising UnicodeDecodeError and losing the whole page.
    text = requests.get(page_url, timeout=30).content.decode('gbk', errors='replace')
    soup = BeautifulSoup(text, 'lxml')
    css = 'html body div#main div.t table#ajaxtable tbody tr.tr3.t_one.tac td.tal'
    # Each selected <td> holds an <h3><a href=...> with the post link.
    return [cell.h3.a['href'] for cell in soup.select(css)]
def get_page_photourls(page_url):
"""根据详情页地址获取该页的所有图片的网址列表"""
text = str(requests.get(page_url).content, encoding='gbk')
pattern = re.compile('<input src=\'(http:.+?)\' type=\'image\' onclick="window.open')
return re.findall(pattern, text)
def main():
    """Walk every list page, visit each post and download its photos.

    Images are written to the current working directory, named after the
    last path component of their URL; a running total is printed per file.
    """
    secret = "网址保密,需要请私聊"  # host name redacted by the original author
    url = 'https://%s/thread0806.php?fid=16&search=&page=' % secret
    page_number = 173  # number of list pages to crawl
    n = 0  # running count of downloaded images
    for i in range(page_number):
        try:
            # NOTE: the original line carried a stray '·' glyph here,
            # which was a SyntaxError — removed.
            print('正在打开第%s个翻页' % (i + 1))
            page_url = url + str(i + 1)  # URL of list page i+1
            article_urls = get_page_urls(page_url)  # all posts on that page
            for article in article_urls:
                print('正在解析网址: https://%s/%s' % (secret, article))
                photos = get_page_photourls('https://%s/' % secret + article)
                for photo in photos:
                    n += 1
                    filename = photo.split('/')[-1]
                    # timeout keeps one dead image host from hanging the crawl
                    with open(filename, 'wb') as f:
                        f.write(requests.get(photo, timeout=30).content)
                    print('成功下载第%s张图片' % n)
        except Exception as e:  # best-effort boundary: skip a bad page, keep crawling
            print(e)
            continue


if __name__ == '__main__':
    main()