Python 3, multi-threaded crawling of small movies ~ ~


Because of this year's epidemic we can't go to the cinema, and we can't experience the fun of a girlfriend on the right arm and a girlfriend on the left.
But even stuck at home, there is still a way to have some fun...

Crawl Douban directly

None of the above is the point. The point is: let's see how to crawl. And don't overthink it, we are crawling movies off a website, not climbing mountains!!
Take a look at Douban: each page of the Top 250 list displays 25 movies.
[Screenshot: a page of the Douban Top 250 list]
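
Since every page just bumps the start query parameter by 25, the page URLs all follow one simple pattern. A quick sketch of what the code below builds in a loop:

# each page shows 25 entries; page i (0-based) starts at start=25*i
urls = [f'https://movie.douban.com/top250?start={25 * i}&filter=' for i in range(10)]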

Here is the code. First without a thread pool: let's see how long 10 pages of movie posters take to download.

# -*- coding: utf-8 -*-
"""
@ auth : carl_DJ
@ time : 2020-8-14
"""

import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve

douban_path = '../py_class/pict'
if not os.path.exists(douban_path):
    os.makedirs(douban_path)

def down_load(url):
    # add a User-Agent header so the site does not block the request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
    r = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(r.text, 'lxml')
    # find the parent div that holds every poster on the current page
    content = soup.find('div', class_='article')
    # collect the img tags of all the movie posters
    images = content.find_all('img')
    # download links of all the posters
    pic_link_list = [image['src'] for image in images]
    # names of all the posters (taken straight from the alt attribute)
    pic_name_list = [image['alt'] for image in images]

    for name, link in zip(pic_name_list, pic_link_list):
        # urlretrieve is flaky under Python 3.7 and raised errors at run time,
        # urlretrieve(link, f'{douban_path}/{name}.jpg')

        # so fall back to the old read/write approach
        html = requests.get(link, timeout=10)
        with open(f'{douban_path}/{name}.jpg', 'wb') as f:
            f.write(html.content)
    print(f'{url} - all posters on this page downloaded')


def main():
    stat_urls = ['https://movie.douban.com/top250', ]
    # use start={25*i} because each page shows 25 movies
    for i in range(1, 10):
        stat_urls.append(f'https://movie.douban.com/top250?start={25*i}&filter=')
        # print(stat_urls)

    # start time
    start_time = time.time()
    # run the downloads one page at a time
    for url in stat_urls:
        down_load(url)

    # end time
    end_time = time.time()
    print('=' * 50)
    print(f'Elapsed time: {end_time - start_time:.2f}s')

if __name__ == '__main__':
    main()

We look for the tag with class='article':

content = soup.find('div', class_='article')

because every movie poster we want to crawl is a child of the class='article' div.

As the old saying goes: once you have found the father, are you afraid you cannot find the son?? (Find the parent node, and the child img tags follow.)
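
To make the parent-then-children lookup concrete, here is a minimal self-contained sketch on a toy HTML string (the markup and names are made up, just to illustrate the two selectors, not Douban's real page):

from bs4 import BeautifulSoup

html = '''
<div class="article">
  <img src="https://example.com/p1.jpg" alt="Movie One">
  <img src="https://example.com/p2.jpg" alt="Movie Two">
</div>
'''
soup = BeautifulSoup(html, 'lxml')
content = soup.find('div', class_='article')   # find the father...
images = content.find_all('img')               # ...and the sons follow
print([(img['alt'], img['src']) for img in images])
# [('Movie One', 'https://example.com/p1.jpg'), ('Movie Two', 'https://example.com/p2.jpg')]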

In the end, the single-threaded crawl takes 49s+.


Use a thread pool to crawl Douban

Let's see how much speed multithreading buys us. Downloading is I/O-bound (the threads mostly sit waiting on the network), so this is exactly the kind of workload where Python threads pay off despite the GIL.

"""
@ auth : carl_DJ
@ time : 2020-8-14
"""

import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
from concurrent.futures import ThreadPoolExecutor ,wait,ALL_COMPLETED

douban_path = '../py_class/pict'
if not os.path.exists(douban_path):
    os.mkdir(douban_path)

def down_load(url):
    headers = {
    
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
    r = requests.get(url,headers=headers)
    soup = BeautifulSoup(r.text,'lxml')
    #找到当前页面所有图片的父div
    content = soup.find('div',class_= 'article') 
    #获取所有电影图片的标签
    images = content.find_all('img')
    #所有图片的下载链接
    pic_link_list = [image['src'] for image in images] 
    #获取所有图片的名字 
    pic_name_list =[image['alt'] for image in images] 
     
    for name,link in zip(pic_name_list,pic_link_list):
        #urlretrieve暂时对 python3.7的支持,不太稳定,运行时报错,
        # urlretrieve(link,f'{douban_path}/{name}.jpg')
        
        #运用老方法来读写
        html = requests.get(link)
        with open(f'{douban_path}/{name}.jpg','wb') as f :
            f.write(html.content)
    print(f'{url}所有电影下载完成')

def main():
    stat_urls = ['https://movie.douban.com/top250',]

    for i in range(1,10):
        stat_urls.append(f'https://movie.douban.com/top250?start={25*i}&filter=')
        # print(stat_urls)

    #开始时间
    start_time  = time.time()

   #定义10个线程
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []  #获取运行结果
        for url in stat_urls:
            future = executor.submit(down_load,url)
            futures.append(future)

    #等到所有线程执行完成,在进行后续逻辑
    wait(futures,return_when=ALL_COMPLETED)

    #结束时间
    end_time = time.time()
    print('='*50)
    #打印时间差
    print(f'运行时按{end_time -  start_time}')

if __name__ == '__main__':
    main()
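
One subtlety: by the time wait(futures, ...) runs, the with-block has already joined the pool, so the call is a no-op safeguard. If you also want to surface per-page errors, as_completed is the usual tool. A minimal sketch reusing the same down_load (my variant, not the original code):

from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl_all(urls):
    with ThreadPoolExecutor(max_workers=10) as executor:
        # map each future back to its URL so failures can be reported
        futures = {executor.submit(down_load, url): url for url in urls}
        for future in as_completed(futures):
            url = futures[future]
            try:
                future.result()  # re-raises any exception from the worker
            except Exception as exc:
                print(f'{url} failed: {exc}')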

Looking at the run, it now finishes in 20s+, which is a lot faster. So if you can multi-thread, don't single-thread; let those idle resources get moving ~ ~
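
For completeness, the same fan-out can be written more compactly with executor.map, which submits every URL and, by draining the iterator inside the with-block, waits for them all. Again just a sketch under the same assumptions:

from concurrent.futures import ThreadPoolExecutor

def crawl_all_compact(urls):
    with ThreadPoolExecutor(max_workers=10) as executor:
        # map() schedules down_load for every URL; consuming the iterator
        # blocks until each result is ready
        list(executor.map(down_load, urls))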

Finally, here is what the crawled pictures look like:

[Screenshot: the downloaded posters, each saved under its movie's name]

؏؏☝ᖗ乛◡乛ᖘ☝؏؏
In the end, I hope everyone manages to crawl the small movies they want ~ ~
But I have to remind you:

in moderation it is a pleasure; in excess it hurts the body ~

Origin: blog.csdn.net/wuyoudeyuer/article/details/107996727