Python scheduled crawler: scraping Weibo hot-search data and displaying it as a pyecharts dynamic chart

There is nothing to complain about; everything you face now is the price of the choices you made before.
No road in life is walked in vain; every step counts.

Table of Contents

1. Running tasks periodically with the schedule module

Python has a lightweight task-scheduling library: schedule. It can run tasks every minute, every hour, every day, on a given day of the week, or on a specific date, which makes it very convenient for lightweight periodic jobs.

# Install
pip install schedule -i http://pypi.douban.com/simple --trusted-host pypi.douban.com

import schedule
import time
 
def run():
    """Placeholder job: print a fixed message each time it is invoked."""
    message = "I'm doing something..."
    print(message)
 
# Register the same job on several example cadences.
schedule.every(10).minutes.do(run)              # run every ten minutes
schedule.every().hour.do(run)                   # run every hour
schedule.every().day.at("10:30").do(run)        # run every day at 10:30
schedule.every().monday.do(run)                 # run every Monday, at this time of day
schedule.every().wednesday.at("13:15").do(run)  # run every Wednesday at 13:15

# Poll the scheduler forever.
while True:
    schedule.run_pending()  # run_pending: execute every job that is currently due
    # Sleep between polls; without this the loop busy-waits and pins a CPU
    # core. One second is far finer than the shortest interval above.
    time.sleep(1)

2. Crawling Weibo hot-search data

Insert picture description here

Insert picture description here
A page structured like this (the data sits in an HTML table) can be crawled with the pd.read_html() method.

# -*- coding: UTF-8 -*-
"""
@File    ：微博热搜榜.py
@Author  ：叶庭云
@Date    ：2020/9/18 15:01
"""
import logging
import os
import time
from datetime import datetime

import pandas as pd
import schedule

# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

# Module-level counter read and updated by get_content().
count = 0


def get_content():
    """Crawl the Weibo top-10 hot searches and append them to ``datas.csv``.

    The first HTML table on the hot-search page is parsed with
    ``pandas.read_html`` and rows 1-10 are kept (row 0 is skipped --
    presumably the pinned, unranked entry; verify against the live page).
    Each ``关键词`` cell is split on a double space into the keyword and its
    heat value, and a ``时间`` column holding the crawl time (minute
    resolution) is added. Columns written: 序号, 关键词, 热度, 时间.

    The CSV header is written only when ``datas.csv`` does not exist yet or
    is empty, so restarting the script does not insert a second header row
    in the middle of the data.

    If the page cannot be downloaded or contains no table, the error is
    logged and this run is skipped, so the surrounding scheduler loop keeps
    running.

    Side effects:
        Appends rows to ``datas.csv`` and increments the module-level
        ``count`` after every successful crawl.
    """
    global count   # module-level counter of successful crawls
    csv_path = 'datas.csv'
    print('----------- 正在爬取数据 -------------')
    url = 'https://s.weibo.com/top/summary?cate=realtimehot&sudaref=s.weibo.com&display=0&retcode=6102'
    try:
        tables = pd.read_html(url)
    except (ValueError, OSError):
        # ValueError: no table found in the page; OSError covers URL/HTTP errors.
        logging.exception('Failed to fetch the hot-search table; skipping this run')
        return

    # Keep the top 10. .copy() gives an independent frame so the column
    # assignments below do not write into a slice of the parsed table.
    df = tables[0][1:11][['序号', '关键词']].copy()
    time_ = datetime.now().strftime("%Y/%m/%d %H:%M")     # current time
    df['序号'] = df['序号'].apply(int)

    # Split once: part 0 is the keyword, part 1 is the heat value.
    parts = df['关键词'].str.split('  ', expand=True)
    df['关键词'] = parts[0]
    df['热度'] = parts[1]
    df['时间'] = time_

    # Decide on the header from the file itself rather than from in-process
    # state, so the file stays well-formed across restarts.
    write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    df.to_csv(csv_path, mode='a', index=False, header=write_header)
    count += 1


# Scheduled crawler: run get_content once every minute.
schedule.every(1).minutes.do(get_content)

# Poll the scheduler forever.
while True:
    schedule.run_pending()
    # Sleep between polls; without this the loop busy-waits and pins a CPU
    # core while waiting for the next one-minute tick.
    time.sleep(1)

Weibo hot searches are generally updated about once a minute, so the crawler is scheduled to run every minute. Let the program run for a while, and the Weibo hot-search data will accumulate in the CSV file.

3. pyecharts dynamic chart visualization

1. Basic timeline carousel chart

from pyecharts import options as opts
from pyecharts.charts import Bar, Timeline
from pyecharts.faker import Faker
from pyecharts.globals import CurrentConfig, ThemeType

# Point pyecharts at a local copy of its JS assets.
CurrentConfig.ONLINE_HOST = 'D:/python/pyecharts-assets-master/assets/'

timeline = Timeline(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))

# One bar chart per year (2015-2019), each filled with Faker sample data.
for year in range(2015, 2020):
    chart = Bar()
    chart.add_xaxis(Faker.choose())
    chart.add_yaxis("商家A", Faker.values())
    chart.add_yaxis("商家B", Faker.values())
    chart.set_global_opts(title_opts=opts.TitleOpts(f"商店{year}年商品销售额"))
    timeline.add(chart, f"{year}年")

timeline.render("timeline_multi_axis.html")

The result looks like this:

Insert picture description here

from pyecharts import options as opts
from pyecharts.charts import Bar, Timeline
from pyecharts.faker import Faker
from pyecharts.globals import ThemeType, CurrentConfig


# Point pyecharts at a local copy of its JS assets.
CurrentConfig.ONLINE_HOST = 'D:/python/pyecharts-assets-master/assets/'

timeline = Timeline(init_opts=opts.InitOpts(theme=ThemeType.DARK))

# One horizontal bar chart per year (2015-2019), filled with Faker sample data.
for year in range(2015, 2020):
    right_label_a = opts.LabelOpts(position="right")
    right_label_b = opts.LabelOpts(position="right")
    chart = Bar()
    chart.add_xaxis(Faker.choose())
    chart.add_yaxis("商家A", Faker.values(), label_opts=right_label_a)
    chart.add_yaxis("商家B", Faker.values(), label_opts=right_label_b)
    chart.reversal_axis()  # swap the axes so the bars run horizontally
    chart.set_global_opts(
        title_opts=opts.TitleOpts(f"Timeline-Bar-Reversal (时间: {year} 年)")
    )
    timeline.add(chart, f"{year}年")

timeline.render("timeline_bar_reversal.html")

The result looks like this:
Insert picture description here

2. Weibo hot-search dynamic chart

import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar, Timeline, Grid
from pyecharts.globals import ThemeType, CurrentConfig

# Point pyecharts at a local copy of its JS assets.
CurrentConfig.ONLINE_HOST = 'D:/python/pyecharts-assets-master/assets/'
df = pd.read_csv('datas.csv')

# Every crawl appended one top-10 list, so each group of 10 consecutive rows
# is one animation frame. Derive the frame count from the data instead of
# hard-coding it; a trailing incomplete group is ignored.
rows_per_frame = 10
frame_count = len(df) // rows_per_frame

t = Timeline(init_opts=opts.InitOpts(theme=ThemeType.MACARONS))  # custom theme
for i in range(frame_count):
    frame = df.iloc[i * rows_per_frame:(i + 1) * rows_per_frame]
    bar = (
        Bar()
        # Reverse the rows so that rank 1 ends up at the top once the axes
        # are flipped.
        .add_xaxis(frame['关键词'][::-1].tolist())         # x-axis data
        .add_yaxis('热度', frame['热度'][::-1].tolist())   # y-axis data
        .reversal_axis()     # flip the axes: horizontal bars
        .set_global_opts(    # global options
            title_opts=opts.TitleOpts(  # title: crawl time of this frame
                title=f"{frame['时间'].iloc[0]}",
                pos_right="5%", pos_bottom="15%",
                title_textstyle_opts=opts.TextStyleOpts(
                    font_family='KaiTi', font_size=24, color='#FF1493'
                )
            ),
            xaxis_opts=opts.AxisOpts(   # x-axis options
                splitline_opts=opts.SplitLineOpts(is_show=True),
            ),
            yaxis_opts=opts.AxisOpts(   # y-axis options
                splitline_opts=opts.SplitLineOpts(is_show=True),
                axislabel_opts=opts.LabelOpts(color='#DC143C')
            )
        )
        .set_series_opts(    # series options
            label_opts=opts.LabelOpts(  # value labels
                position="right", color='#9400D3')
        )
    )
    # Widen the left margin to leave room for the keyword labels.
    grid = Grid().add(bar, grid_opts=opts.GridOpts(pos_left="24%"))
    t.add(grid, "")

# The playback settings do not depend on the frame, so set them once.
t.add_schema(
    play_interval=100,          # playback speed
    is_timeline_show=False,     # whether to show the timeline component
    is_auto_play=True,          # whether to start playing automatically
)

t.render('时间轮播图.html')

The result looks like this:

Insert picture description here