# python3 网络爬虫学习 3.4 抓取猫眼电影排行

import requests
import re
import time
import json
def get_one_page(url, timeout=10):
    """Fetch a page and return its body text, or None on a non-200 response.

    Args:
        url: Address to request.
        timeout: Seconds passed to ``requests.get`` as its timeout. requests
            applies no timeout unless one is given, so a stalled server
            would otherwise block the call indefinitely.

    Returns:
        The response text when the status code is 200, otherwise None.

    Raises:
        requests.exceptions.RequestException: propagated unchanged from
            ``requests.get`` (connection failures, timeouts, ...).
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    # Any other status: report "no page" to the caller explicitly instead of
    # printing the literal None and falling off the end of the function.
    return None
def parse_one_page(html):
    """Yield one dict per movie entry found in a board page.

    The pattern walks each ``<dd>`` entry, whose relevant markup looks like::

        <i class="board-index board-index-9">9</i>
        <img data-src="http://.../poster.jpg@160w_220h_1e_1c" alt="TITLE" class="board-img" />
        <p class="name"><a href="/films/1212" ...>TITLE</a></p>
        <p class="star"> LABEL:actor,actor,actor </p>
        <p class="releasetime">LABEL:2001-07-20(country)</p>
        <p class="score"><i class="integer">9.</i><i class="fraction">3</i></p>

    Args:
        html: Page markup as a string. None or an empty string yields nothing.

    Yields:
        dict with string values under the keys ``index``, ``image``,
        ``title``, ``actor``, ``time`` and ``score``.
    """
    if not html:
        # No page body (e.g. the fetch returned None): re.findall would
        # raise TypeError on None, so produce no items instead.
        return

    # re.S lets "." span newlines, because one entry covers many lines.
    pattern = re.compile(r"<dd>.*?board-index.*?>(.*?)</i>.*?<img data-src=(.*?) alt=(.*?)class=.*?/>.*?<p class=.*?><a href=.*?</a></p>.*?<p class=.*?>(.*?)</p>.*?<p class=.*?>(.*?)</p>.*?<p class=.*?><i class=.*?>(.*?)</i><i class=.*?>(.*?)</i></p>", re.S)

    # Each match is a 7-tuple, e.g.
    # ('1', '"http://.../x.jpg@160w_220h_1e_1c"', '"TITLE" ',
    #  '\n   LABEL:a,b,c\n   ', 'LABEL:1993-01-01(...)', '9.', '6')
    for rank, image, title, actor, release, integer, fraction in pattern.findall(html):
        yield {
            "index": rank,
            # NOTE: the captured attribute value keeps its surrounding
            # double quotes; only the title has them sliced off below.
            "image": image,
            # Drop the opening quote and the closing quote + trailing space.
            "title": title[1:-2],
            # Trim surrounding whitespace, then skip the 3-character label.
            "actor": actor.strip()[3:],
            # Skip the 5-character label in front of the release date.
            "time": release[5:],
            # Integer part ("9.") and fraction digit ("6") are separate tags.
            "score": integer + fraction,
        }
def write(item):
    """Append one record to the output file as a single JSON line.

    Args:
        item: JSON-serialisable object to store.
    """
    # The encoding is pinned to UTF-8: ensure_ascii=False writes raw
    # non-ASCII characters, and the platform default encoding used by open()
    # may be unable to represent them.
    with open("D:/猫眼电影TOP100(1).txt", "a", encoding="utf-8") as f:
        # json.dumps escapes non-ASCII text as \uXXXX by default;
        # ensure_ascii=False keeps the characters readable in the file.
        f.write(json.dumps(item, ensure_ascii=False) + "\n")
def main():
    """Fetch the ten board pages (offsets 0, 10, ..., 90) and store every entry."""
    for page in range(10):
        url = "http://maoyan.com/board/4?offset=" + str(10 * page)
        html = get_one_page(url)
        # get_one_page gives None for a non-200 response; skip that page
        # rather than handing None to the regex parser.
        if html:
            for item in parse_one_page(html):
                write(item)
        # Pause one second between consecutive requests.
        time.sleep(1)


# Run the scrape only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

# 猜你喜欢

# 转载自blog.csdn.net/luslin/article/details/81673015
# 今日推荐