Crawling animated GIFs from Baozou Manhua (Runaway Comics) with Python

Recently I came across a good Python crawler tutorial online. After working through it I had a general understanding of how crawlers work, so I wrote one that downloads the animated GIFs from Baozou Manhua (Runaway Comics). The link to the original tutorial is attached as well; you will get a lot out of it if you watch it in its entirety.

Source code

Not much to say, here is the code. It targets Python 3 and depends on requests, beautifulsoup4, lxml, and tqdm.

# -*- coding: UTF-8 -*-

import requests
import bs4
import sys
import os
import re
from multiprocessing.dummy import Pool as ThreadPool
import urllib3
from tqdm import tqdm
import shutil

baseUrl = 'http://baozoumanhua.com/catalogs/gif'

curDir = os.getcwd()

htmlDir = os.path.join(curDir, 'htmls')

gifDir = os.path.join(curDir, 'gifs')

gifMap = {}

noneCnt = 0

# Characters that Windows does not allow in file names (plus whitespace)
pat = re.compile(r'[\\/?*:|<>\s"]')

total_pages = 1000

# Progress bars, created in get_pages() and downloader()
get_pages_bar = None
get_gifs_bar = None


def get_html_text(url):
    # Fetch a page and return the decoded HTML; return an empty string on
    # failure so the saved page file simply yields no articles when parsed
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ''


def get_pages(num):
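    # Download listing pages 1..num concurrently and save them under htmls/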
    global get_pages_bar
    get_pages_bar = tqdm(total=total_pages, ascii=True)
    tqdm.write('.')
    tqdm.write('.')
    tqdm.write('.')
    tqdm.write('Downloading web pages...')
    num += 1
    pool = ThreadPool(8)
    pool.map(download_page, range(1, num))
    pool.close()
    pool.join()
    get_pages_bar.close()


def get_gif_name(num, item):
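    # Build a name like "author:title" for one post; fall back to "author:NA<n>" when the post has no title link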
    global noneCnt
    author = item.find('a', 'text-overflow').string + ':'
    a_s = item.find_all('a')
    for a in a_s:
        if a.has_attr('data-full-url'):
            gif_name = author + a.string
            return gif_name
    gif_name = author + 'NA' + str(noneCnt)
    noneCnt += 1
    return gif_name


def get_gif_links(item):
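    # Collect the real GIF URLs from the lazily loaded <img> tags of one post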
    imgs = item.find_all('img', 'lazy lazy-img none')
    links = []
    for img in imgs:
        if img.has_attr('data-original'):
            links.append(img['data-original'])
    return links


def add_gifMap(name, links):
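    # Register name -> URL pairs in gifMap, numbering the files when a post has several GIFs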
    global gifMap
    if len(links) < 1:
        return
    if len(links) == 1:
        gif_name = name + '.gif'
        gifMap[gif_name] = links[0]
        return
    for i in range(1, len(links) + 1):
        gif_name = name + str(i) + '.gif'
        gifMap[gif_name] = links[i - 1]


def get_gifs(num):
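    # Parse the saved listing pages and fill gifMap with file names and download links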
    tqdm.write('.')
    tqdm.write('.')
    tqdm.write('.')
    tqdm.write('Parsing pages...')
    num += 1
    get_links_bar = tqdm(total=total_pages, ascii=True)
    for n in range(1, num):
        file_name = os.path.join(htmlDir, 'page' + str(n) + '.html')
        soup = bs4.BeautifulSoup(open(file_name, 'rb'), 'lxml')
        article = soup.find_all('div', 'article')
        for item in article:
            gif_name = get_gif_name(n, item)
            gif_links = get_gif_links(item)
            add_gifMap(gif_name, gif_links)
        get_links_bar.update(1)
    get_links_bar.close()


def download_gif(name):
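    # Sanitize the file name for Windows and download one GIF, skipping files that already exist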
    global gifMap
    global pat
    global get_gifs_bar
    file_name = re.sub(pat, '_', name)
    try:
        if os.path.exists(os.path.join(gifDir, file_name)):
            return
        r = requests.get(gifMap[name], timeout=30, verify=False)
        r.raise_for_status()
        with open(os.path.join(gifDir, file_name), 'wb') as fo:
            fo.write(r.content)
    except (requests.RequestException, OSError):
        tqdm.write('Download ' + name + ' fail...')
    finally:
        get_gifs_bar.update(1)


def downloader():
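    # Download every GIF in gifMap using a pool of 8 worker threads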
    total_gifs = len(gifMap.keys())
    tqdm.write('.')
    tqdm.write('.')
    tqdm.write('.')
    tqdm.write('Downloading gifs...')
    global get_gifs_bar
    get_gifs_bar = tqdm(total=total_gifs, ascii=True)
    pool = ThreadPool(8)
    pool.map(download_gif, gifMap.keys())
    pool.close()
    pool.join()
    get_gifs_bar.close()


def download_page(num):
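    # Fetch one listing page and save it as htmls/page<num>.html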
    url = baseUrl + '?page=' + str(num)
    file_name = os.path.join(htmlDir, 'page' + str(num) + '.html')
    with open(file_name, 'w', encoding='utf-8') as fo:
        fo.write(get_html_text(url))
    get_pages_bar.update(1)


def set_env():
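    # Prepare the working directories; if the gifs folder already holds more than 5000 files, only refresh the newest 10 pages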
    global total_pages
    if os.path.exists(gifDir) and sum([len(x) for _, _, x in os.walk(gifDir)]) > 5000:
        total_pages = 10
        tqdm.write('Find many gifs in dir, just update gifs...')
    if not os.path.exists(gifDir):
        os.mkdir(gifDir)
    if os.path.exists(htmlDir):
        shutil.rmtree(htmlDir)
    os.mkdir(htmlDir)


def main():
    set_env()
    get_pages(total_pages)
    get_gifs(total_pages)
    downloader()
    shutil.rmtree(htmlDir)
    tqdm.write('Congratulations!!!')
    tqdm.write('All pictures in folder : gifs...')
    tqdm.write('Just open the folder and enjoy yourself!!!')
    os.system('pause')  # Windows-only; keeps the console window open
    return 0


if __name__ == "__main__":
    # Suppress the InsecureRequestWarning triggered by the verify=False downloads
    urllib3.disable_warnings()
    sys.exit(main())
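For reference, the parser above assumes each listing page wraps every post in a div with class "article". Here is a minimal, self-contained sketch of that extraction step, run against a hand-written HTML fragment (reconstructed from the selectors the crawler uses; the real baozoumanhua.com markup may differ):

# -*- coding: UTF-8 -*-

import bs4

# Hypothetical fragment mirroring the classes and attributes the crawler expects
sample = '''
<div class="article">
  <a class="text-overflow">SomeAuthor</a>
  <a data-full-url="http://example.com/post/1">Funny gif title</a>
  <img class="lazy lazy-img none" data-original="http://example.com/1.gif"/>
</div>
'''

soup = bs4.BeautifulSoup(sample, 'lxml')
for item in soup.find_all('div', 'article'):
    # Same lookups as get_gif_name and get_gif_links above
    author = item.find('a', 'text-overflow').string
    title = next(a.string for a in item.find_all('a')
                 if a.has_attr('data-full-url'))
    links = [img['data-original']
             for img in item.find_all('img', 'lazy lazy-img none')
             if img.has_attr('data-original')]
    print(author, title, links)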

Running the program

The first run downloads GIFs from 1000 listing pages. Please be patient: it can take around 30 minutes depending on network conditions, and make sure the disk has about 13 GB of free space.
Once the script has completed a full run, running it again only updates the newest 10 pages of GIFs.
When execution finishes, all GIFs are saved in the gifs folder under the current directory.
