爬虫爬取猫眼电影排行

爬取排行榜 TOP 榜单（共 10 页，按 offset 每页偏移 10 条）
网站有人机验证：运行爬虫前，先在浏览器中手动访问一次该网站即可
边学边敲的

import json
import requests
from lxml import etree


def getOnePage(n, timeout=10):
    """Download one page of the Maoyan board and return its HTML text.

    Args:
        n: Zero-based page index; it is turned into the ``offset`` query
            parameter as ``n * 10``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded as text (``Response.text``). The HTTP
        status code is not checked.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.
    """
    url = f'https://maoyan.com/board/4?offset={n*10}'
    # Send a desktop-browser User-Agent instead of the default requests one.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
    r = requests.get(url, headers=header, timeout=timeout)
    return r.text


def parse(text):
    """Yield one record per movie found in a board page.

    Args:
        text: HTML source of a board page.

    Yields:
        A new dict ``{'name': ..., 'time': ...}`` for each movie, where
        ``name`` is the ``title`` attribute of the movie link and ``time``
        is the text of the matching ``<p class="releasetime">`` element.
        Names and times are paired positionally; if the two lists differ
        in length the surplus entries are dropped (``zip`` semantics).
    """
    # Parse (and normalize) the raw HTML into an element tree.
    html = etree.HTML(text)
    names = html.xpath('//*[@id="app"]/div/div/div[1]/dl/dd/a/@title')
    release_times = html.xpath('//p[@class="releasetime"]/text()')
    for name, release_time in zip(names, release_times):
        # Build a fresh dict for every record: re-yielding one shared dict
        # would make all collected items alias the last movie.
        yield {'name': name, 'time': release_time}


def save2File(data, path='D://Project//Pycharm project//spider//猫眼//movie.json'):
    """Append one record to the output file as a line of JSON text.

    The record is serialized with ``ensure_ascii=False`` (non-ASCII
    characters are written as-is) and followed by ``',\\n'``. The resulting
    file is therefore a sequence of comma-terminated JSON objects, not a
    single valid JSON document.

    Args:
        data: A JSON-serializable object (the crawler passes a dict).
        path: Destination file; opened in append mode with UTF-8 encoding.
            Defaults to the original hard-coded location.

    Raises:
        OSError: If the file cannot be opened or written.
        TypeError: If ``data`` is not JSON-serializable.
    """
    line = json.dumps(data, ensure_ascii=False) + ',\n'
    with open(path, 'a', encoding='utf-8') as f:
        f.write(line)


def run():
    """Crawl board pages 0 through 9 and append every parsed record to disk.

    Each page is fetched, parsed, and its records are saved one at a time
    before the next page is requested.
    """
    for page_index in range(10):
        html_text = getOnePage(page_index)
        for record in parse(html_text):
            save2File(record)


# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    run()
发布了149 篇原创文章 · 获赞 14 · 访问量 8976

猜你喜欢

转载自blog.csdn.net/weixin_45485719/article/details/104367163