Scraping Maoyan movie information with Python

  1. Scrape the info for all TOP100 movies (title, starring actors, release date)
  2. Download each movie's poster image and save it under the local /mnt/img/ directory (the posted code skips this step; see the sketch after get_info below)
  3. Store the scraped info in a MySQL database (title, starring actors, release date, local path of the poster image)
# gevent's monkey patching should run as early as possible, before other
# socket users are imported, so that urlopen cooperates with gevent
from gevent import monkey
monkey.patch_socket()

import re
import time

import gevent
import pymysql
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen

# sample of the markup being parsed:
# <p class="releasetime">上映时间:1998-04-03</p>

url = 'http://maoyan.com/board/4?offset='


def get_page(url):
    """Build the ten paginated board URLs (offset=0, 10, ..., 90)."""
    return [url + str(i * 10) for i in range(10)]
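
# For reference, the URLs this builds:
#   >>> get_page('http://maoyan.com/board/4?offset=')[:3]
#   ['http://maoyan.com/board/4?offset=0',
#    'http://maoyan.com/board/4?offset=10',
#    'http://maoyan.com/board/4?offset=20']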

def load_page_content(url):
    """Fetch a page and return its HTML with newlines and tabs stripped."""
    with urlopen(url, timeout=40) as f:
        content = f.read().decode('utf-8')
        return content.replace('\n', '').replace('\t', '')
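
# Maoyan has been known to reject urllib's default User-Agent. A hedged
# drop-in variant that sends a browser-like header instead (the header value
# and the _ua name are assumptions, not from the original post):
from urllib.request import Request

def load_page_content_ua(url):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    with urlopen(req, timeout=40) as f:
        return f.read().decode('utf-8').replace('\n', '').replace('\t', '')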

def get_info(url):
    """Extract each movie's rank, title, stars and release date from one board page."""
    content = load_page_content(url)
    # rank, e.g. <i class="board-index board-index-1">
    pattern_rank = r'<i class="board-index board-index-(\d+)">'
    film_rank = re.findall(pattern_rank, content)
    # movie title, taken from the poster's alt attribute
    pattern_name = r'alt="(\w+·?:?\w*)'
    film_name = re.findall(pattern_name, content)
    # starring actors, from <p class="star">主演:...</p>
    pattern_actor = r'class="star">\s*\w+:([\w+·?\w*·?\w*,?]+)'
    film_actor = re.findall(pattern_actor, content)
    # release date, e.g. <p class="releasetime">上映时间:1998-04-03</p>
    pattern_time = r'上映时间:(\w+-?\w*-?\w*\(?\w*\)?)'
    film_time = re.findall(pattern_time, content)
    return list(zip(film_rank, film_name, film_actor, film_time))
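
# Task 2 from the list at the top (saving poster images under /mnt/img/) is
# not implemented in the original listing. A minimal sketch in the same
# urlopen style; the data-src pattern and the save_covers helper are
# assumptions, not part of the original post.
import os

def save_covers(content, page_no):
    """Hypothetical helper: download every poster found on one page to /mnt/img/."""
    os.makedirs('/mnt/img', exist_ok=True)
    # assumed markup: Maoyan lazy-loads posters via <img data-src="...">
    pattern_img = r'<img data-src="([^"]+)"'
    paths = []
    for i, img_url in enumerate(re.findall(pattern_img, content)):
        path = '/mnt/img/%d_%d.jpg' % (page_no, i)
        with urlopen(img_url, timeout=40) as f, open(path, 'wb') as out:
            out.write(f.read())
        paths.append(path)
    return paths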


# Save the scraped info to the database
def save_to_mysql(url):
    conn = pymysql.connect(host='localhost', user='root', passwd='123',
                           db='westos01', charset='utf8')
    cur = conn.cursor()
    try:
        insert_sql = 'insert into filminfo values(%s,%s,%s,%s);'
        info = get_info(url)
        cur.executemany(insert_sql, info)
        conn.commit()
    except Exception as e:
        print('Failed to insert into MySQL:', e)
    else:
        print('Inserted into MySQL successfully')
    finally:
        cur.close()
        conn.close()
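
# The post never shows the filminfo table's definition. A plausible one-time
# setup matching the four-column insert above; the column names and types
# are assumptions.
def create_table():
    conn = pymysql.connect(host='localhost', user='root', passwd='123',
                           db='westos01', charset='utf8')
    with conn.cursor() as cur:
        cur.execute('''
            CREATE TABLE IF NOT EXISTS filminfo (
                rank_no VARCHAR(10),
                name VARCHAR(100),
                actors VARCHAR(200),
                release_time VARCHAR(50)
            ) DEFAULT CHARSET=utf8
        ''')
    conn.commit()
    conn.close()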

# Serial scraping, one page at a time
def main(url):
    page_li = get_page(url)
    for page_url in page_li:
        save_to_mysql(page_url)


# Scrape with a thread pool
def useThreading(url):
    page_li = get_page(url)
    with ThreadPoolExecutor(max_workers=4) as pool:
        pool.map(save_to_mysql, page_li)

# Scrape with gevent coroutines; completion order is not deterministic
def geventMain(url):
    page_li = get_page(url)
    gevents = [gevent.spawn(save_to_mysql, page_url) for page_url in page_li]
    gevent.joinall(gevents)

# Time the three strategies; note that running all three back to back
# inserts every movie into filminfo three times over.
start = time.time()
main(url)
end = time.time()
print('%s run %s' % (main.__name__, end - start))

start = time.time()
useThreading(url)
end = time.time()
print('%s run %s' % (useThreading.__name__, end - start))

start = time.time()
geventMain(url)
end = time.time()
print('%s run %s' % (geventMain.__name__, end - start))

Reposted from blog.csdn.net/mashaokang1314/article/details/80815172