Writing a crawler again - scraping the Maoyan movie Top 100 list


tags: python, yield, csv, re, urllib

I recently learned the scrapy framework and noticed yield showing up everywhere, so I decided to use yield in this little example as well for practice, and at the same time experience the legendary pain of parsing data with regular expressions. To better enjoy the fruits of the labor, the crawled data is stored in a csv file. At first a few movies failed to parse and some rows were missing, but when I pulled out that source data and ran the regex against it on its own, it matched fine, so the regex was the part that needed adjusting.
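
Before the full script, here is a minimal, self-contained sketch of why the non-greedy `.*?` combined with re.S matters when cutting one <dd>...</dd> block per movie out of the page. It is my own illustration, not part of the original script, and the HTML snippet is made up:

import re

# Made-up snippet with two <dd> entries spread over several lines.
html = "<dd>\n  <i>1</i> Movie A\n</dd>\n<dd>\n  <i>2</i> Movie B\n</dd>"

# Greedy .* runs all the way to the LAST </dd>, so everything becomes one giant match.
print(re.findall(r'<dd>.*</dd>', html, flags=re.S))
# Non-greedy .*? stops at the FIRST </dd>, giving one match per movie entry.
print(re.findall(r'<dd>.*?</dd>', html, flags=re.S))
# re.S (DOTALL) lets . also match newlines, which is why the pattern can span lines.

With that out of the way, the full script follows.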

import csv
import time
import re
import urllib.request
# import os, http.cookiejar


# Save the data to a csv file
def save2csv(generator, filepath, field_names):
    with open(filepath, 'w', encoding="utf-8", newline='', ) as fp:
        # Write rows as dicts; this is the approach I personally recommend
        write = csv.DictWriter(fp, fieldnames=field_names)
        # Write the header row
        write.writeheader()
        while True:
            try:
                msg_list = next(generator)
                # for msg in msg_list:
                #     write.writerow(msg)
                # It is already a list, so why not use writerows (which internally maps writerow over the rows)
                write.writerows(msg_list)
            except StopIteration:
                # The generator is exhausted, i.e. there is no more data
                print("StopIteration. data is none.")
                break
            except Exception as e:
                print(e)
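
# Note (my addition, not in the original post): the while / next / StopIteration
# dance above is equivalent to simply iterating the generator, which is the more
# idiomatic form:
#
#     for msg_list in generator:
#         write.writerows(msg_list)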


# Walk through the ten pages (a generator function)
def request_url(start_url, headers):
    # The top 100 list has 100 movies, 10 per page, so the loop count is hard-coded
    for i in range(10):
        url = start_url.format(i * 10)
        print(url)
        request = urllib.request.Request(url, headers=headers)
        # Not adding headers one by one any more; they are passed in directly instead
        # request.add_header("User-Agent", headers["User-Agent"])
        # request.add_header("Cookie", headers["Cookie"])
        response = urllib.request.urlopen(request)
        msg_list = pick_movie_msgs(response)
        print(len(msg_list), msg_list)
        if msg_list:
            yield msg_list
        # Don't hammer the server, and avoid getting the IP banned
        time.sleep(3)


# Pre-compiled regex; this is where I finally understood the power of `.*?`
_rule = re.compile(r'<dd>.*?>(\d+)</i>'  # rank on the board
                   + r'.*?src="https(.*?)".*?>'  # cover image url (the "https" prefix is re-attached later)
                   + r'.*?title="(.*?)"'  # movie title
                   + r'.*?star">(.*?)</p>'  # starring
                   + r'.*?releasetime">.*?(\d+-\d+-\d+)'  # release date
                   + r'.*?integer">(\d+.).*?fraction">(\d).*?</dd>', flags=re.S)  # score (integer and fraction parts)


# Pull out the useful fields
def pick_movie_msgs(response):
    page_source_data = response.read()
    page_source = page_source_data.decode("utf-8")
    # print(page_source)
    data_list = _rule.findall(page_source)
    # print(data_list)
    msg_list = []
    for data in data_list:
        # data is a tuple of captured groups
        if data:
            msg = {
                "order_number": data[0].strip(),
                "img_url": "".join(("https", data[1].strip())),
                "title": data[2].strip(),
                "star": data[3].strip().split(":")[1],
                "release_time": data[4].strip(),
                "score": "".join((data[5].strip(), data[6].strip())),
            }
            msg_list.append(msg)
    return msg_list


# Program entry point
def main():
    # Maoyan movie board top 100
    start_url = "https://maoyan.com/board/4?offset={}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Cookie": "...",
    }
    filepath = './maoyan.csv'  # path of the output file
    data_head = ["order_number", "img_url", "title", "star", "release_time", "score"]
    # the generator
    generator = request_url(start_url, headers)
    print(type(generator))
    save2csv(generator, filepath, data_head)


# Start crawling
main()
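
As a quick sanity check once the crawl finishes, a short snippet like the one below (my addition, not part of the original post) reads the csv back with csv.DictReader and prints the first few rows; the field names are the ones defined in data_head above.

import csv

with open('./maoyan.csv', encoding='utf-8', newline='') as fp:
    reader = csv.DictReader(fp)
    for i, row in enumerate(reader):
        if i >= 3:
            break
        print(row["order_number"], row["title"], row["release_time"], row["score"])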


Original post: www.cnblogs.com/trent-fzq/p/11204474.html