Scraping the Maoyan Movies TOP100 board with Python

This article has participated in the "Newcomer Creation Ceremony" activity, and started the road of Nuggets creation together

1. Use the requests library and regular expressions to scrape the Maoyan Movies TOP100 board

(1) The website of Maoyan Movie is as follows, with a total of 10 pages.

The ten page URLs follow the pattern: maoyan.com/board/4?offset=0, maoyan.com/board/4?offset=10, ..., maoyan.com/board/4?offset=90

import re
import json
import time
import requests
from requests.exceptions import RequestException
#from fake_useragent import UserAgent
复制代码

(2) Define the get_one_page(url) method to obtain the source code of the specified webpage.

def get_one_page(url):
    """
    Send an HTTP GET request and return the page source.

    :param url: the board page URL to fetch
    :return: response body on HTTP 200, otherwise None (also on any
             request-level failure such as a timeout)
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'
    }
    try:
        response = requests.get(url, timeout=30, headers=headers)
    except RequestException:
        # Network error / timeout: callers treat None as "no page".
        return None
    return response.text if response.status_code == 200 else None
复制代码

(3) Define the parse_one_page(html) method, parse the source code, and obtain each movie information.

insert image description here

def parse_one_page(html):
    """
    Extract movie records from one board page with a regular expression.

    Yields one dict per movie with keys: index, image, title, actor,
    time, score.

    :param html: page source from get_one_page(); may be None
    :return: generator of movie dicts
    """
    # One <dd>...</dd> element per movie; seven groups capture the rank,
    # poster URL, title, cast line, release-date line and the two halves
    # of the rating.
    movie_re = re.compile(
        '<dd>'
        '.*?board-index.*?>(.*?)</i>'   # rank on the board
        '.*?data-src="(.*?)"'           # poster image URL
        '.*?name.*?a.*?>(.*?)</a>'      # movie title
        '.*?star.*?>(.*?)</p>'          # cast line ("主演:...")
        '.*?releasetime.*?>(.*?)</p>'   # release-date line ("上映时间:...")
        '.*?integer.*?>(.*?)</i>'       # rating, integer part
        '.*?fraction.*?>(.*?)</i>'      # rating, fractional part
        '.*?</dd>', re.S)
    for rank, image, title, star, released, whole, frac in movie_re.findall(str(html)):
        yield {
            'index': rank,
            'image': image,
            'title': title.strip(),
            # Drop the 3-char "主演:" label; blank when the field is short.
            'actor': star.strip()[3:] if len(star) > 3 else '',
            # Drop the 5-char "上映时间:" label likewise.
            'time': released.strip()[5:] if len(released) > 5 else '',
            'score': whole.strip() + frac.strip(),
        }
复制代码

(4) Define the write_to_file(content) method to append the movie information, one JSON record per line, to a text file (result.txt).

def write_to_file(content):
    """
    Append one movie record to result.txt as a JSON line.

    ensure_ascii=False keeps the Chinese text human-readable in the
    output file.

    :param content: dict describing a single movie
    :return: None
    """
    line = json.dumps(content, ensure_ascii=False) + ',\n'
    with open('result.txt', 'a', encoding='utf-8') as out:
        out.write(line)
复制代码

(5) Define the main(offset) method to tie the methods above together: fetch one page, parse it, and persist every record.

def main(offset):
    """
    Fetch one board page selected by its offset, parse it and persist
    every record found.

    :param offset: pagination offset in the board URL (0, 10, ..., 90)
    :return: None
    """
    page_url = "http://maoyan.com/board/4?offset=" + str(offset)
    source = get_one_page(page_url)
    for movie in parse_one_page(source):
        print(movie)
        write_to_file(movie)
复制代码

(6) Use a for loop to traverse all URLs.

if __name__ == '__main__':
    # The TOP100 spans 10 pages (offsets 0, 10, ..., 90). range(9) only
    # reached offset 80 and silently dropped the last ten movies, so use
    # range(10) to crawl the full board.
    for i in range(10):
        main(offset=i * 10)
        time.sleep(5)  # be polite to the server between page fetches
复制代码
import re
import json
import time
import requests
from requests.exceptions import RequestException
#from fake_useragent import UserAgent


def get_one_page(url):
    """
    Download one board page and hand back its HTML source.

    :param url: address of the page to download
    :return: response text when the server answers 200, else None
    """
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0'
    }
    try:
        resp = requests.get(url, timeout=30, headers=request_headers)
        if resp.status_code != 200:
            return None
        return resp.text
    except RequestException:
        # Timeout / connection failure: signal "no page" to the caller.
        return None

def parse_one_page(html):
    """
    Turn one page of board HTML into structured movie dicts.

    :param html: raw page source (may be None; it is coerced via str())
    :return: generator yielding one dict per movie
    """
    movie_pattern = re.compile('<dd>.*?board-index.*?>(.*?)'
                               '</i>.*?data-src="(.*?)".*?name.*?a.*?>(.*?)'
                               '</a>.*?star.*?>(.*?)'
                               '</p>.*?releasetime.*?>(.*?)'
                               '</p>.*?integer.*?>(.*?)'
                               '</i>.*?fraction.*?>(.*?)'
                               '</i>.*?</dd>', re.S)
    for match in movie_pattern.finditer(str(html)):
        rank, img, name, cast, release, int_part, frac_part = match.groups()
        record = {'index': rank, 'image': img, 'title': name.strip()}
        # Strip the 3-char "主演:" label; blank when the field is short.
        record['actor'] = cast.strip()[3:] if len(cast) > 3 else ''
        # Strip the 5-char "上映时间:" label the same way.
        record['time'] = release.strip()[5:] if len(release) > 5 else ''
        # Rating is split into integer and fractional halves in the HTML.
        record['score'] = int_part.strip() + frac_part.strip()
        yield record

def write_to_file(content):
    """
    Serialise one movie dict to JSON and append it to result.txt.

    ensure_ascii=False keeps Chinese characters readable in the file.

    :param content: movie dict to persist
    :return: None
    """
    with open('result.txt', mode='a', encoding='utf-8') as handle:
        handle.write('{},\n'.format(json.dumps(content, ensure_ascii=False)))

def main(offset):
    """
    Crawl a single TOP100 page selected by its offset and store the rows.

    :param offset: page offset in the board URL (multiples of 10)
    :return: None
    """
    target = "http://maoyan.com/board/4?offset=" + str(offset)
    page = get_one_page(target)
    for record in parse_one_page(page):
        print(record)
        write_to_file(record)

if __name__ == '__main__':
    # The board has 10 pages; range(1) crawled only the first page and
    # produced a TOP10 instead of the promised TOP100. range(10) walks
    # offsets 0, 10, ..., 90.
    for i in range(10):
        main(offset=i * 10)
        time.sleep(5)  # throttle requests so the site is not hammered

复制代码
import re
import time
import requests
from requests.exceptions import RequestException
import xlwings as xw
#from fake_useragent import UserAgent

def getHTML(url):
    """
    Download one page of the Maoyan board.

    The response encoding is switched to the detected (apparent)
    encoding before returning, so Chinese text decodes correctly.

    :param url: page address
    :return: page source on HTTP 200, otherwise None
    """
    ua = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
          'AppleWebKit/537.36 (KHTML, like Gecko) '
          'Chrome/52.0.2743.116 Safari/537.36')
    try:
        resp = requests.get(url, timeout=30, headers={'User-Agent': ua})
        resp.encoding = resp.apparent_encoding
        return resp.text if resp.status_code == 200 else None
    except RequestException:
        return None

def findMaoyan(html):
    """
    Parse one page of board HTML and append each movie as a row to the
    shared global list ``mlist`` (created by main()).

    :param html: page source, may be None
    :return: the updated mlist
    """
    global mlist
    row_pattern = re.compile(
        '<dd>'
        '.*?board-index.*?>(.*?)</i>'   # rank (index)
        '.*?data-src="(.*?)"'           # poster image URL
        '.*?name.*?a.*?>(.*?)</a>'      # movie title
        '.*?star.*?>(.*?)</p>'          # cast line (主演)
        '.*?releasetime.*?>(.*?)</p>'   # release-date line (上映时间)
        '.*?integer.*?>(.*?)</i>'       # rating, integer part (评分)
        '.*?fraction.*?>(.*?)</i>'      # rating, fractional part
        '.*?</dd>', re.S)
    for rank, img, name, cast, release, whole, frac in row_pattern.findall(str(html)):
        mlist.append([
            rank,                                             # index
            img,                                              # image URL
            name.strip(),                                     # title
            cast.strip()[3:] if len(cast) > 3 else '',        # cast, label removed
            release.strip()[5:] if len(release) > 5 else '',  # date, label removed
            whole.strip() + frac.strip(),                     # full score
        ])
    return mlist

def main():
    """
    Crawl all ten TOP100 pages, collect the rows into the global mlist
    and dump the whole table into an Excel sheet via xlwings.

    :return: None
    """
    global mlist
    # Header row first; parsed movie rows are appended below it.
    mlist = [['index', 'image', 'title', '主演', '上映时间', '评分']]
    for page in range(10):
        page_url = "http://maoyan.com/board/4?offset=" + str(page * 10)
        findMaoyan(getHTML(page_url))
        time.sleep(1)  # short pause between requests

    # Write the collected table into Excel starting at cell A1.
    workbook = xw.Book()
    sheet = workbook.sheets('Sheet1')
    sheet.range('a1').value = mlist

# Script entry point: crawl the board and export the table to Excel.
if __name__ == '__main__':
    main()


复制代码

(Screenshots of the crawler output and the resulting Excel sheet omitted.)

Guess you like

Origin juejin.im/post/7079709897515859982