猫眼电影TOP100榜

import re
import requests


def get_html(url):
    """Download *url* and return the response body as text.

    A desktop-browser User-Agent header is sent with the request.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body, or None when the request fails
        (connection error, timeout, or an HTTP error status).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}
    try:
        # Without a timeout a stalled server would block this call forever.
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException as exc:
        # Only network/HTTP failures are handled here; KeyboardInterrupt and
        # programming errors propagate instead of being silently swallowed.
        print('request failed:', exc)
        return None
    return r.text


# Each board entry starts with its rank marker; the marker is used both to
# read the rank and to slice the page into one chunk per entry.
_RANK_RE = re.compile(r'<i class="board-index[^>]*>(\d+)</i>')
_MOVIE_RE = re.compile(r'<a href=[^>]* title="([^"]*)"')
# The label colon is accepted in both ASCII and full-width form.
_STAR_RE = re.compile(r'<p class="star">.*?主演[:：](.*?)\s*</p>', re.S)
# Captures YYYY, YYYY-MM or YYYY-MM-DD; anything after the date is ignored.
_TIME_RE = re.compile(
    r'<p class="releasetime">上映时间[:：]\s*(\d{4}(?:-\d{2}){0,2})')
_SCORE_INT_RE = re.compile(r'<i class="integer">(.*?)</i>')
_SCORE_FRAC_RE = re.compile(r'<i class="fraction">(.*?)</i>')


def _first_group(pattern, chunk):
    """Return group 1 of the first match of *pattern* in *chunk*, or ''."""
    found = pattern.search(chunk)
    return found.group(1) if found else ''


def parse_html(text):
    """Print one summary line per board entry found in *text*.

    The page is cut into one chunk per rank marker and every field (title,
    cast, release date, score) is looked up inside its own chunk.  A field
    that is absent or malformed in one entry is printed as an empty string
    and cannot shift the fields of the entries that follow, which is what
    happened when independent per-field lists were zipped together.

    Args:
        text: HTML source of one board page.

    Returns:
        None.  Results are written to standard output; nothing is printed
        when *text* contains no rank marker.
    """
    rank_hits = list(_RANK_RE.finditer(text))
    # An entry's chunk ends where the next rank marker begins (or at EOF).
    chunk_ends = [hit.start() for hit in rank_hits[1:]] + [len(text)]

    for hit, end in zip(rank_hits, chunk_ends):
        # Cut at the closing </dd>, when present, so trailing page content
        # after the last entry is never searched.
        chunk = text[hit.end():end].split('</dd>', 1)[0]

        rank = hit.group(1)
        movie = _first_group(_MOVIE_RE, chunk)
        star = _first_group(_STAR_RE, chunk)
        timestamp = _first_group(_TIME_RE, chunk)
        # The score is rendered as two adjacent elements, e.g. "9." and "6".
        score = (_first_group(_SCORE_INT_RE, chunk)
                 + _first_group(_SCORE_FRAC_RE, chunk))

        print('排名:', rank, '电影名称:', movie, '主演:', star, '上映时间:', timestamp, '评分:', score)


if __name__ == '__main__':
    # The board is split over ten pages; each page is selected by an
    # ``offset`` query parameter that advances in steps of ten (0, 10, ... 90).
    board = 'http://maoyan.com/board/4'
    for offset in range(0, 100, 10):
        page = get_html('{}?offset={}'.format(board, offset))
        if not page:
            # Nothing to parse for this page; move on to the next one.
            continue
        parse_html(page)

猜你喜欢

转载自www.cnblogs.com/jp-mao/p/10005268.html
今日推荐