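# - Scrape the information of every movie on the TOP100 board (movie title, leading actors, release date).
# - Download each movie's promotional cover image and save it to the local /mnt/img/ directory.
# - Save the collected information to a MySQL database (movie title, leading actors, release date, local path of the cover image).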
import re
from concurrent.futures import ThreadPoolExecutor
import time
import json
import pymysql
from gevent import monkey
# Replace the standard socket module with gevent's cooperative version (per
# gevent's documentation) so blocking socket calls can yield to other greenlets.
# NOTE(review): this runs after pymysql and concurrent.futures were imported
# above and before urllib.request is imported below — confirm that ordering
# is what is intended.
monkey.patch_socket()
import gevent
# Board URL handed to get_page(), which derives the per-page URLs from it.
url='http://maoyan.com/board/4?offset=0'
from urllib.request import urlopen
def get_page(url):
    """Build the URLs of the ten board pages (offsets 0, 10, ..., 90).

    The offset is appended to ``url`` as text. If ``url`` already ends with a
    numeric offset value (``...offset=0``), that value is removed first so the
    result is ``offset=10`` rather than the malformed ``offset=010`` that plain
    concatenation produces. URLs that do not end in ``offset=<digits>`` are
    used unchanged as the prefix.

    :param url: board URL, normally ending in ``offset=`` or ``offset=<n>``.
    :return: list of ten page URL strings, in ascending offset order.
    """
    # Strip only a trailing numeric offset; everything else is left untouched
    # so prefixes that relied on raw concatenation behave as before.
    base = re.sub(r'(offset=)\d+$', r'\1', url)
    return [base + str(page * 10) for page in range(10)]
def load_page_content(url):
    """Fetch ``url`` and return its body as text with newlines and tabs removed.

    The response is read with a 40 second timeout and decoded as UTF-8.

    :param url: address passed straight to ``urlopen``.
    :return: decoded page text without any ``\\n`` or ``\\t`` characters.
    """
    with urlopen(url, timeout=40) as response:
        raw = response.read()
    text = raw.decode('utf-8')
    # Flatten the markup onto one line so the regexes in callers never have
    # to deal with line breaks or tab indentation.
    for whitespace in ('\n', '\t'):
        text = text.replace(whitespace, '')
    return text
def get_info(url):
    """Scrape rank, name, leading actors and release time from one board page.

    The page is fetched with ``load_page_content`` and each field is pulled
    out with its own regular expression. The four match lists are zipped
    together, so the result is as long as the shortest of them.

    :param url: URL of a single board page.
    :return: list of ``(rank, name, actors, release_time)`` string tuples.
    """
    html = load_page_content(url)
    # One pattern per column, in the order the tuples are assembled:
    # rank, film name, leading actors, release time.
    field_patterns = (
        r'<i class="board-index board-index-(\d+)">',
        r'alt="(\w+·?:?\w*)',
        r'class="star">\s*\w+:([\w+·?\w*·?\w*,?]+)',
        r'上映时间:(\w+-?\w*-?\w*\(?\w*\)?)',
    )
    columns = [re.findall(pattern, html) for pattern in field_patterns]
    return list(zip(*columns))
def Mysql_reserve(url):
    """Scrape one board page and insert its rows into the ``filminfo`` table.

    Opens a connection to the local ``westos01`` database, collects the rows
    with ``get_info(url)`` and writes them with a single ``executemany``
    followed by a commit. Any ``Exception`` raised while scraping or writing
    is reported on stdout and not re-raised; a failure to connect still
    propagates to the caller.

    The cursor and the connection are released in ``finally`` blocks so they
    are closed on every exit path, including exceptions that are not
    ``Exception`` subclasses (for example ``KeyboardInterrupt``).

    :param url: URL of the board page to scrape and store.
    :return: None.
    """
    conn = pymysql.connect(host='localhost', user='root', passwd='123',
                           db='westos01', charset='utf8')
    try:
        cur = conn.cursor()
        try:
            insert_sql = 'insert into filminfo values(%s,%s,%s,%s);'
            info = get_info(url)
            cur.executemany(insert_sql, info)
            conn.commit()
        except Exception as e:
            # Best effort: report the failure and carry on with other pages.
            print('To lead mysql failure', e)
        else:
            print('To lead mysql success')
        finally:
            cur.close()
    finally:
        conn.close()
def main(url):
    """Scrape and store every board page sequentially.

    :param url: board URL passed to ``get_page`` to derive the page URLs.
    """
    pages = get_page(url)
    index = 0
    # Pages are handled strictly one after another, in list order.
    while index < len(pages):
        Mysql_reserve(pages[index])
        index += 1
def useTreading(url):
    """Scrape and store every board page using a pool of four threads.

    One ``Mysql_reserve`` task is submitted per page; leaving the ``with``
    block waits for all of them. Task results are not collected.

    :param url: board URL passed to ``get_page`` to derive the page URLs.
    """
    pages = get_page(url)
    with ThreadPoolExecutor(max_workers=4) as executor:
        for page_url in pages:
            executor.submit(Mysql_reserve, page_url)
def geventMain(url):
    """Scrape and store every board page using one gevent greenlet per page.

    :param url: board URL passed to ``get_page`` to derive the page URLs.
    """
    jobs = []
    for page_url in get_page(url):
        jobs.append(gevent.spawn(Mysql_reserve, page_url))
    # Block until every spawned greenlet has finished.
    gevent.joinall(jobs)
# Run the sequential, thread-pool and gevent variants back to back and print
# the wall-clock time each one took.
for runner in (main, useTreading, geventMain):
    start = time.time()
    runner(url)
    end = time.time()
    print('%s run %s' % (runner.__name__, end - start))