Scraping Images from CaoLiu with Python

"""TODO: 只抓热门精华可以提高图片质量"""
import re

import requests
from bs4 import BeautifulSoup


def get_page_urls(page_url):
    """Return the links of all posts on the given listing page."""
    # the site serves GBK-encoded HTML; ignore any undecodable bytes
    text = requests.get(page_url, timeout=10).content.decode('gbk', errors='ignore')
    soup = BeautifulSoup(text, 'lxml')
    # each matching <td> holds the post's title link inside <h3><a>
    css = 'html body div#main div.t table#ajaxtable tbody tr.tr3.t_one.tac td.tal'
    return [cell.h3.a['href'] for cell in soup.select(css)]
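
The TODO at the top suggests restricting the crawl to featured posts. A minimal sketch of that idea, assuming featured posts carry a visible '精华' marker somewhere in their listing row (the real markup may differ, so verify it against the page source first):

def get_featured_urls(page_url, marker='精华'):
    """Hypothetical variant of get_page_urls that keeps only featured posts."""
    text = requests.get(page_url, timeout=10).content.decode('gbk', errors='ignore')
    soup = BeautifulSoup(text, 'lxml')
    urls = []
    for row in soup.select('table#ajaxtable tr.tr3.t_one'):
        cell = row.select_one('td.tal')
        if cell is None or cell.h3 is None or cell.h3.a is None:
            continue
        # assumption: featured posts show the marker somewhere in the row text
        if marker in row.get_text():
            urls.append(cell.h3.a['href'])
    return urls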


def get_page_photourls(page_url):
    """Return the URLs of all images on a post's detail page."""
    text = requests.get(page_url, timeout=10).content.decode('gbk', errors='ignore')
    # images are embedded as <input type='image'> elements; capture their src
    pattern = re.compile('<input src=\'(http:.+?)\' type=\'image\' onclick="window.open')
    return re.findall(pattern, text)
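
Matching HTML with a regular expression is brittle: a change in attribute order or quoting silently breaks the pattern. The same extraction can go through the parser instead; this sketch assumes, as the regex does, that the images are embedded as <input type='image'> elements:

def get_page_photourls_bs(page_url):
    """Parser-based alternative to the regex in get_page_photourls."""
    text = requests.get(page_url, timeout=10).content.decode('gbk', errors='ignore')
    soup = BeautifulSoup(text, 'lxml')
    return [tag['src'] for tag in soup.find_all('input', type='image')
            if tag.get('src', '').startswith('http')]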


def main():
	secret = "网址保密,需要请私聊"
    url = 'https://%s/thread0806.php?fid=16&search=&page=' % secret
    page_number = 173  # number of listing pages to crawl
    n = 0  # running count of downloaded images
    for i in range(page_number):
        try:
            print('Opening listing page %s' % (i + 1))
            page_url = url + str(i + 1)  # URL of listing page i + 1
            article_urls = get_page_urls(page_url)  # all post links on this page
            for article in article_urls:
                print('Parsing post: https://%s/%s' % (secret, article))
                photos = get_page_photourls('https://%s/%s' % (secret, article))
                for photo in photos:
                    n += 1
                    filename = photo.split('/')[-1]
                    with open(filename, 'wb') as f:
                        f.write(requests.get(photo).content)
                        print('Downloaded image %s' % n)

        except Exception as e:  # keep crawling even if one listing page fails
            print(e)
            continue


if __name__ == '__main__':
    main()
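
The inner loop above writes every image into the working directory and fetches it again on every run. A hedged sketch of a more defensive download helper; the photos/ output directory, the shared session, and the 10-second timeout are assumptions, not part of the original script:

import os

session = requests.Session()  # reuse one connection pool across downloads

def download_photo(photo_url, out_dir='photos'):
    """Download one image; skip it if an identically named file already exists."""
    os.makedirs(out_dir, exist_ok=True)
    filename = os.path.join(out_dir, photo_url.split('/')[-1])
    if os.path.exists(filename):
        return False  # already fetched on an earlier run
    resp = session.get(photo_url, timeout=10, stream=True)
    resp.raise_for_status()  # surface HTTP errors instead of saving an error page
    with open(filename, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=8192):
            f.write(chunk)
    return True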
