Scraping Maoyan box office data with Python and writing it to MySQL

Learning record:

Table of contents

1. Target URL analysis

2. Code implementation

3. Effect display


1. Target URL analysis

Web page: aHR0cHM6Ly9waWFvZmFuZy5tYW95YW4uY29tL2Rhc2hib2FyZA==

Interface: https://piaofang.maoyan.com/dashboard-ajax

Web page source code view:

View parameters: 

Return to the interface to view:

Analysis: the interface is a GET request, and some of its parameters are encoded and signed by JavaScript. The combined box office figure in the data returned by the target interface is obfuscated with a custom font as an anti-scraping measure. The interface returns fresh data on every request, and the font mapping file it returns also changes every time.

Implementation steps :

1. F12 for JS reverse debugging, analyze parameters

 2. Simulate sending requests, get interface parameters for data analysis

 3. Download the font file, work out its glyph mapping, and use ddddocr to recognize the glyphs

 4. Analyze the data, compare and replace fonts in a loop

 5. Create the corresponding table in MySQL, and use pymysql to store the data in it

Create table:

 Operation database:

2. Code implementation

MySQL:

-- Table receiving one row per movie inserted by the Python script below.
-- NOTE(review): the Python insert statement names this table `maoyan` (lowercase);
-- whether that matches depends on the server's table-name case-sensitivity setting.
create table MAOYAN
(
ID  int(100) AUTO_INCREMENT  not null primary key,  -- auto-generated row key
电影id varchar(1000) null,  -- movie id
电影名称 varchar(1000) null,  -- movie name
上映天数 varchar(1000) null,  -- release info (days since release)
排片场次 varchar(1000) null,  -- number of screenings
票房占比 varchar(1000) null,  -- box office share
总票房 varchar(1000) null,  -- total box office
综合票房 varchar(1000) null  -- combined box office (decoded from the obfuscated font)
);

SELECT * FROM MAOYAN

drop table maoyan

Python:

"""
CSDN: 抄代码抄错的小牛马
mailbox:[email protected]
"""
import base64
import hashlib
import random
import re
import io
import ddddocr

import pymysql
import requests
import time
import uuid
from userAgentPooL import userAgent

from fontTools.ttLib import TTFont
from PIL import ImageFont, Image, ImageDraw

# User-Agent string taken from the local pool; passed to get_data(), which
# Base64-encodes it and includes it in the signed query string.
ua = userAgent.get_ua()
# Shared ddddocr recognizer; process() uses it to read rendered font glyphs.
ocr = ddddocr.DdddOcr()

# Endpoint requested by get_data(); its response is parsed as JSON.
url = 'https://piaofang.maoyan.com/dashboard-ajax'

# HTTP headers sent with the request made in get_data().
# NOTE(review): this User-Agent is a fixed string and is not the pooled `ua`
# that gets signed into the query — confirm the server tolerates the mismatch.
headers = {
    "Accept": "application/json, text/plain, */*",
    "Referer": "https://piaofang.maoyan.com/dashboard",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.35",
}


# Fetch the data returned by the interface
def get_data(uuid, ua):
    """Request the dashboard interface and return its decoded JSON body.

    The query is signed as spelled out below: the User-Agent is
    Base64-encoded, combined with a millisecond timestamp, a random index and
    fixed channel/version/key fields, and the MD5 hex digest of that string is
    sent as ``signKey``.

    Args:
        uuid: The ``uuid`` module itself (the caller passes the imported
            module); a fresh ``uuid4()`` is generated from it per request.
        ua: Plain User-Agent string to Base64-encode and sign.

    Returns:
        The parsed JSON body of the response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    # Keep the parameters intact instead of rebinding them to new kinds of values.
    request_id = uuid.uuid4()  # random-based uuid
    timeStamp = int(time.time() * 1000)  # 13-digit millisecond timestamp
    encoded_ua = base64.b64encode(ua.encode('utf-8')).decode('utf-8')
    index = int(1000 * random.random() + 1)
    strs = f'method=GET&timeStamp={timeStamp}&User-Agent={encoded_ua}&index={index}&channelId=40009&sVersion=2&key=A013F70DB97834C0A5492378BD76C53A'

    signKey = hashlib.md5(strs.encode('utf-8')).hexdigest()
    params = {
        "orderType": "0",  # box office type: 0 - combined, 1 - split
        "uuid": str(request_id),  # may be fixed or random
        "timeStamp": timeStamp,  # 13-digit timestamp
        "User-Agent": encoded_ua,  # Base64 of the User-Agent
        "index": index,  # random number in 1..1000
        "channelId": "40009",  # fixed
        "sVersion": "2",  # fixed
        "signKey": signKey  # standard MD5 of the string above
    }

    # Without a timeout requests may block indefinitely on a stalled server.
    response = requests.get(url=url, headers=headers, params=params, timeout=10)
    # Fail here with a clear HTTP error rather than later on a missing JSON key.
    response.raise_for_status()

    return response.json()


# Translate one obfuscated box-office figure into readable digits
def _decode_box_office(encoded, glyph_digits):
    """Replace the ``&#x....;`` entities in *encoded* with recognised digits.

    Args:
        encoded: Entity string such as ``'&#xe137;.&#xf0a6;'``; a ``.`` inside
            a piece marks a decimal point in front of that digit.
        glyph_digits: Mapping from glyph name (``'uniE137'``) to the character
            recognised for that glyph.

    Returns:
        The decoded number as a string. Entities whose glyph is not in the
        mapping are skipped.
    """
    pieces = []
    for entity in encoded.split(';'):
        if not entity:
            continue
        # Strip the entity prefix and any dot, then upper-case to match glyph names.
        glyph_name = 'uni' + entity.replace('&#x', '').replace('.', '').upper()
        digit = glyph_digits.get(glyph_name)
        if digit is None:
            continue
        pieces.append('.' + digit if '.' in entity else digit)
    return ''.join(pieces)


# Process the fetched data
def process(resp, connect):
    """Decode the font-obfuscated box office figures and store every movie.

    Downloads the web font referenced in ``resp['fontStyle']`` to
    ``maoyan.woff``, renders each glyph to an image, recognises it with the
    module-level ``ocr`` object, then decodes each movie's combined box office
    and writes one row per movie through ``save_data``.

    Args:
        resp: Decoded JSON from the interface; must contain ``fontStyle`` and
            ``movieList.data.list``.
        connect: Open database connection handed on to ``save_data``.

    Raises:
        ValueError: If no ``.woff`` font URL can be found in ``fontStyle``.
        requests.RequestException: If the font download fails or times out.
    """
    font_json = resp['fontStyle']
    # Raw string: the pattern contains regex escapes that are not valid
    # Python string escapes.
    font_match = re.search(r'opentype"\),url\("(//.*?\.woff)"', font_json)
    if font_match is None:
        raise ValueError('no .woff font URL found in fontStyle')
    fonturl = 'http:' + font_match.group(1)
    print('字体下载链接为:', fonturl)
    font_data = requests.get(fonturl, timeout=10)
    font_data.raise_for_status()

    # Save the font locally
    with open('maoyan.woff', mode='wb') as f:
        f.write(font_data.content)

    # Read the local font file
    font_file = TTFont('maoyan.woff')
    uni_list = font_file.getGlyphOrder()[2:]  # the first two entries carry no value
    print('uni列表为:', uni_list)

    # Draw each glyph onto an image and let OCR recognise the digit it shows
    charList = []
    font = ImageFont.truetype('maoyan.woff', 40)
    for uchar in uni_list:
        # Glyph names look like 'uniE137'; the part after 'uni' is the code point in hex.
        unknown_char = chr(int(uchar[3:], 16))
        im = Image.new(mode='RGB', size=(42, 40), color='white')
        draw = ImageDraw.Draw(im=im)
        draw.text(xy=(0, 0), text=unknown_char, fill=0, font=font)
        img_byte = io.BytesIO()
        im.save(img_byte, format='JPEG')
        charList.append(ocr.classification(img_byte.getvalue()))
    print('对应字符为:', charList)

    # One dictionary lookup per entity instead of scanning the glyph list each time
    glyph_digits = dict(zip(uni_list, charList))

    # Parse the data and substitute the obfuscated figures movie by movie
    for movie in resp['movieList']['data']['list']:
        info = movie['movieInfo']
        movie_name = info['movieName']  # movie name
        movie_id = info['movieId']  # movie id
        release_info = info['releaseInfo']  # days since release
        show_count = movie['showCount']  # number of screenings
        box_rate = movie['boxRate']  # box office share
        sum_box_desc = movie['sumBoxDesc']  # total box office

        # combined box office, still obfuscated in the response
        end_num = _decode_box_office(movie['boxSplitUnit']['num'], glyph_digits)

        end_data = f'电影名称:{movie_name}  电影id:{movie_id}  上映天数:{release_info}  排片场次:{show_count}  票房占比:{box_rate}  总票房:{sum_box_desc}  综合票房:{end_num}万'
        print(end_data)
        value = (movie_id,
                 movie_name, release_info, show_count, box_rate, sum_box_desc,
                 end_num + '万')

        save_data(connect=connect, value=value)


# Connect to the database
def connect():
    """Open and return a pymysql connection to the local MySQL database.

    Returns:
        The open connection object returned by ``pymysql.connect``.

    Raises:
        pymysql.MySQLError: If the server cannot be reached or the
            credentials are rejected.
    """
    db = pymysql.connect(host='127.0.0.1',  # server name or local IP
                         user='root',  # account
                         password='******',  # replace with your own password
                         database='yxhdatabase')  # name of the database to connect to

    # pymysql.connect raises on failure, so reaching this line means success.
    print("恭喜你,连接成功 !!!")
    return db


# Write to the database
def save_data(connect, value):
    """Insert one movie row into the ``maoyan`` table and commit it.

    Args:
        connect: Open database connection providing ``cursor()`` and
            ``commit()``.
        value: Seven-item sequence matching the column order of the insert
            statement below.

    Returns:
        The cursor that ran the statement; it is already closed.

    Raises:
        Whatever the driver raises from ``execute`` or ``commit``; the cursor
        is closed before the exception propagates.
    """
    sql = 'insert into maoyan(电影id, 电影名称, 上映天数, 排片场次, 票房占比, 总票房, 综合票房) values(%s, %s, %s, %s, %s, %s, %s)'  # parameterized insert statement
    cur = connect.cursor()  # create a cursor
    try:
        cur.execute(sql, value)  # run the statement
        connect.commit()  # commit
    finally:
        # Close the cursor even when the insert or commit fails.
        cur.close()
    return cur


if __name__ == '__main__':
    # Bind the connection to its own name so the connect() function is not
    # shadowed by its result.
    db = connect()
    try:
        resp = get_data(uuid, ua)
        process(resp, db)
        print('===============数据已写入MySQL,请检查!!!========================')
    finally:
        db.close()  # close the connection even if fetching or processing fails

3. Effect display

Guess you like

Origin blog.csdn.net/qq_61122628/article/details/130663247