2020 Teddy Cup Data Analysis Skills Competition, Problem B: Data Analysis of the COVID-19 Epidemic

 

Task 1: Basic data processing

Task 1.1 Using the "City Epidemics" sheet of Attachment 1, compute for every city the daily cumulative number of confirmed cases, cumulative number of cured patients, and cumulative number of deaths, from the first notification of confirmed cases through June 30. Save the result as "task1_1.csv": the first row holds the field names, and the columns A–E hold, in order, city, date, cumulative confirmed cases, cumulative cured patients, and cumulative deaths. Describe the implementation method in the paper, and list the results for Wuhan, Shenzhen, and Baoding on the 10th and 25th of each month.

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['font.family'] = 'SimHei'      # font with CJK glyphs so Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False  # NOTE(review): presumably so minus signs still render with this font — confirm

import pyecharts
from pyecharts import *
from pyecharts.charts import Bar,Pie,Funnel,Scatter,Gauge,Page,WordCloud,Line
from pyecharts import options as opts

# Task 1.1: per-city daily cumulative confirmed / cured / death counts.
data1 = pd.read_excel('附件1.xlsx', sheet_name='城市疫情')

# Every city is expanded onto one continuous daily calendar spanning the
# earliest to the latest date in the sheet, so days without a report still
# get a row.
date_range = pd.date_range(start=data1.日期.min(), end=data1.日期.max())
data1 = data1.set_index('日期')

city_frames = []
for city in data1.城市.unique():
    city_daily = data1[data1['城市'] == city].reindex(index=date_range)
    # Rows inserted by the reindex have no report: their new counts are 0.
    # (Column 0 is the city name; the remaining columns are the counts.)
    city_daily.iloc[:, 1:] = city_daily.iloc[:, 1:].fillna(0)
    # The only gaps left are the city name on the inserted rows.
    city_daily = city_daily.fillna(city)
    city_frames.append(city_daily.reset_index())

# Concatenate once (instead of growing a frame inside the loop) and give the
# result a unique index so the column assignment below aligns unambiguously.
date = pd.concat(city_frames, ignore_index=True)
date = date.rename(columns={'index': '日期'})

# Accumulate only the count columns (everything after date and city);
# running cumsum over the whole frame would also hit the datetime column.
new_count_cols = list(date.columns[2:])
date[['累计确诊人数', '累计治愈人数', '累计死亡人数']] = (
    date.groupby('城市')[new_count_cols].cumsum()
)

# Cast by column name: assigning through .iloc may keep the old float dtype.
count_cols = list(date.columns[2:])
date[count_cols] = date[count_cols].astype(int)

date_ = date.loc[:, ['城市', '日期', '累计确诊人数', '累计治愈人数', '累计死亡人数']].copy()
# Save the result. index=False keeps the row index out of the file, so the
# fields land in columns A-E as the task requires.
date_.to_csv('task1_1.csv', index=False)

data1:

Display the data of Shenzhen, Wuhan, and Baoding on the 10th and 25th of each month:

# Rows dated the 10th or the 25th of any month, one city per notebook cell.
date_[date_['日期'].dt.day.isin([10, 25]) & (date_['城市'] == '深圳')]

date_[date_['日期'].dt.day.isin([10, 25]) & (date_['城市'] == '武汉')]

date_[date_['日期'].dt.day.isin([10, 25]) & (date_['城市'] == '保定')]

 

Task 1.2 Based on the results of task 1.1 and the "City-Province Comparison Table" sheet of Attachment 1, compute the daily new and cumulative figures for each provincial administrative unit. Save the result as "task1_2.csv": the first row holds the field names, and the columns A–H hold, in order, province, date, newly confirmed cases, newly cured patients, new deaths, cumulative confirmed cases, cumulative cured patients, and cumulative deaths. Describe the implementation method in the paper, and list the results for Hubei, Guangdong, and Hebei on the 15th of each month.

# Task 1.2: roll the per-city daily figures up to province level.
data1_sheng = pd.read_excel('附件1.xlsx', sheet_name='城市省份对照表')

# Join on the city column to attach each city's province.
# NOTE(review): pd.merge defaults to an inner join, so a city missing from
# the lookup sheet is dropped here — confirm the sheet covers every city.
data1_2 = pd.merge(date, data1_sheng, on='城市')

# Keep the eight output fields in the required order and give the "new"
# columns the same "...人数" suffix as the cumulative ones.
data1_2 = data1_2.reindex(columns=['省份', '日期', '新增确诊', '新增治愈', '新增死亡',
                                   '累计确诊人数', '累计治愈人数', '累计死亡人数'])
data1_2 = data1_2.rename(columns={'新增确诊': '新增确诊人数',
                                  '新增治愈': '新增治愈人数',
                                  '新增死亡': '新增死亡人数'})

# Sum the cities of each province per day to get the province's daily
# new and cumulative figures.
data1_2 = data1_2.groupby(['省份', '日期'], as_index=False).sum()

# Save the result. index=False keeps the row index out of the file, so the
# fields land in columns A-H as the task requires.
data1_2.to_csv('task1_2.csv', index=False)

 data1_sheng:

Display the data of Guangdong, Hubei, and Hebei on the 15th of each month:

# Rows dated the 15th of any month, one province per notebook cell.
data1_2[(data1_2['日期'].dt.day == 15) & (data1_2['省份'] == '广东')]

data1_2[(data1_2['日期'].dt.day == 15) & (data1_2['省份'] == '湖北')]

data1_2[(data1_2['日期'].dt.day == 15) & (data1_2['省份'] == '河北')]

analyze:

New epidemic data analysis:

# Nationwide daily totals of the "new" figures, one panel per metric.
plt.figure(figsize=(13, 5))

new_panels = [
    (131, '新增确诊人数', 'r', '新增确诊'),
    (132, '新增治愈人数', 'g', '新增治愈'),
    (133, '新增死亡人数', 'k', '新增死亡'),
]
for position, column, colour, caption in new_panels:
    plt.subplot(position)
    # Sum over all provinces for each date, then draw that metric.
    data1_2.groupby('日期').sum()[column].plot(c=colour)
    plt.title(caption)

plt.show()

 

 Cumulative epidemic data analysis:

# Nationwide daily totals of the cumulative figures, one panel per metric.
plt.figure(figsize=(13, 5))

cumulative_panels = [
    (131, '累计确诊人数', 'r', '累计确诊'),
    (132, '累计治愈人数', 'g', '累计治愈'),
    (133, '累计死亡人数', 'k', '累计死亡'),
]
for position, column, colour, caption in cumulative_panels:
    plt.subplot(position)
    # Sum over all provinces for each date, then draw that metric.
    data1_2.groupby('日期').sum()[column].plot(c=colour)
    plt.title(caption)

plt.show()

 pyecharts drawing:

def plot_(data_name, color):
    """Render a pyecharts line chart of one nationwide daily metric.

    Args:
        data_name: Column of ``data1_2`` to plot; the values are summed over
            all rows sharing a date. The series label is this name minus its
            last two characters.
        color: Colour used for the line and for the point markers.

    Returns:
        Whatever ``Line.render_notebook()`` returns for the finished chart.
    """
    per_day = data1_2.groupby('日期').sum()[data_name]
    day_labels = [stamp.strftime('%Y/%m/%d') for stamp in per_day.index]
    day_values = per_day.to_list()

    # Pre-build the option objects so the chart calls below stay short.
    stroke = opts.LineStyleOpts(color=color, width=4, type_="solid")
    extremes = opts.MarkPointOpts(
        data=[
            opts.MarkPointItem(type_=['max'], symbol_size=70),
            opts.MarkPointItem(type_=['min']),
        ],
    )
    point_style = opts.ItemStyleOpts(border_width=1, color=color)
    x_axis = opts.AxisOpts(name='日期', type_='category', name_location='center', name_gap=25)
    y_axis = opts.AxisOpts(
        name='人数',
        type_='value',
        name_location='end',
        name_gap=15,
        splitline_opts=opts.SplitLineOpts(
            is_show=True,
            linestyle_opts=opts.LineStyleOpts(opacity=1),
        ),
    )

    chart = Line()
    chart.add_xaxis(xaxis_data=day_labels)
    chart.add_yaxis(
        series_name=str(data_name)[:-2],
        y_axis=day_values,
        is_smooth=False,
        symbol_size=10,
        linestyle_opts=stroke,
    )
    # Mark the maximum (large symbol) and the minimum of the series.
    chart.set_series_opts(markpoint_opts=extremes, itemstyle_opts=point_style)
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title=f'{data_name}随时间变化图'),
        xaxis_opts=x_axis,
        yaxis_opts=y_axis,
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
    )
    return chart.render_notebook()

# Draw the newly-confirmed series; the two calls below are alternative
# metrics left disabled (colour name corrected from the typo 'blake').
plot_('新增确诊人数', 'red')
# plot_('新增治愈人数', 'blue')
# plot_('新增死亡人数', 'black')

 

 

 

Task 1.3 Based on the results of task 1.2, compute the daily number of hospitalized COVID-19 patients in each provincial administrative unit, and save the result as "task1_3.csv": the first row holds the field names, and the columns A–C hold, in order, province, date, and number of hospitalized patients. Describe the implementation method in the paper, and list the results for Hubei, Guangdong, and Shanghai on the 20th of each month.

Assuming that a patient is hospitalized immediately after diagnosis, the number of hospitalized patients is defined as the cumulative number of confirmed cases minus the cumulative number of cured patients and minus the cumulative number of deaths.

# Task 1.3: start from the first two columns of the province table, labelled
# as province and date.
data1_3 = data1_2.iloc[:, :2].copy()
data1_3.columns = ['省份', '日期']

# Hospitalized = cumulative confirmed - cumulative cured - cumulative deaths.
still_in_hospital = data1_2['累计确诊人数'] - data1_2['累计治愈人数'] - data1_2['累计死亡人数']
data1_3['住院人数'] = still_in_hospital

data1_3.to_csv('task1_3.csv', index=False)

Display the statistical results of Hubei, Guangdong, and Shanghai on the 20th of each month:

# Rows dated the 20th of any month, one province per notebook cell.
data1_3[(data1_3['日期'].dt.day == 20) & (data1_3['省份'] == '湖北')]

data1_3[(data1_3['日期'].dt.day == 20) & (data1_3['省份'] == '广东')]

data1_3[(data1_3['日期'].dt.day == 20) & (data1_3['省份'] == '上海')]

 Task 1.4 Assuming that the spread radius of a COVID-19 patient is 1 km, draw the epidemic spread risk areas of the city on the 6th day and the 10th day in the plan according to Attachment 1 "City A's Epidemic Places", and describe the analysis and the implementation process in the paper.

data3 = pd.read_excel('附件1.xlsx', sheet_name='A市涉疫场所分布')

# Following the problem statement, the day-6 spread area is drawn from
# everything reported up to and including day 8 ...
a1 = data3.loc[data3['通报日期'].le(8)]
# ... and the day-10 spread area from everything reported up to day 12.
a2 = data3.loc[data3['通报日期'].le(12)]

# Wrapper around the global chart options that the charts below share.
def global_opts(line, x_name='', y_name='', title='', bottom=None, left=None, split_line=True,
                axis_max=30):
    """Apply a common set of global options to a pyecharts chart in place.

    Both axes are numeric ('value') axes with their names placed at the end,
    optional split lines, a scrolling horizontal legend and a cross-hair
    tooltip triggered on the axis.

    Args:
        line: Chart object exposing ``set_global_opts``.
        x_name: Name shown on the x axis.
        y_name: Name shown on the y axis.
        title: Chart title.
        bottom: Legend offset from the bottom, passed to ``pos_bottom``.
        left: Legend offset from the left, passed to ``pos_left``.
        split_line: Whether to draw split lines on both axes.
        axis_max: Upper bound of both axes. Defaults to 30, the value that
            used to be hard-coded, so existing calls are unaffected.

    Returns:
        None; the chart is modified in place.
    """
    x_axis = opts.AxisOpts(
        name=x_name, type_='value', name_location='end', name_gap=25, max_=axis_max,
        splitline_opts=opts.SplitLineOpts(is_show=split_line,
                                          linestyle_opts=opts.LineStyleOpts(opacity=1)),
        axistick_opts=opts.AxisTickOpts(),
    )
    y_axis = opts.AxisOpts(
        name=y_name, type_='value', name_location='end', name_gap=15, max_=axis_max,
        splitline_opts=opts.SplitLineOpts(is_show=split_line,
                                          linestyle_opts=opts.LineStyleOpts(opacity=1)),
    )
    legend = opts.LegendOpts(
        type_='scroll',
        pos_bottom=bottom, pos_left=left,
        orient='horizontal', align='left',
        item_gap=10, item_width=25, item_height=15,
        # NOTE(review): 'break' is not a colour name — possibly a typo for
        # 'black'; left unchanged until the intended colour is confirmed.
        inactive_color='break',
    )
    line.set_global_opts(
        title_opts=opts.TitleOpts(title=title),
        xaxis_opts=x_axis,
        yaxis_opts=y_axis,
        legend_opts=legend,
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
    )

 data3:

from pyecharts.charts import EffectScatter

# Day-6 risk map: one red ripple marker per place in a1.
# NOTE(review): columns 2 and 3 of the sheet are presumably the x / y
# coordinates (the axes are labelled that way) — confirm against the sheet.
scatter = EffectScatter()
scatter.add_xaxis(a1.iloc[:, 2].tolist())
# .tolist() hands pyecharts plain Python numbers, matching the x axis above;
# pyecharts expects native types rather than pandas/numpy objects.
scatter.add_yaxis('', a1.iloc[:, 3].tolist(),
                  itemstyle_opts=opts.ItemStyleOpts(color='red'),
                  symbol_size=25,
                  label_opts=opts.LabelOpts(is_show=False))
global_opts(scatter, '横坐标', y_name='纵坐标', title='第6天A市疫情传播风险区域')
scatter.render_notebook()

# Day-10 risk map: one red ripple marker per place in a2.
# NOTE(review): columns 2 and 3 of the sheet are presumably the x / y
# coordinates (the axes are labelled that way) — confirm against the sheet.
scatter = EffectScatter()
scatter.add_xaxis(a2.iloc[:, 2].tolist())
# .tolist() hands pyecharts plain Python numbers, matching the x axis above;
# pyecharts expects native types rather than pandas/numpy objects.
scatter.add_yaxis('', a2.iloc[:, 3].tolist(),
                  symbol_size=25,
                  itemstyle_opts=opts.ItemStyleOpts(color='red'),
                  label_opts=opts.LabelOpts(is_show=False))
global_opts(scatter, '横坐标', y_name='纵坐标', title='第10天A市疫情传播风险区域')

scatter.render_notebook()

 Task 2 digital large screen design

Task 2.1 Design a large digital screen to display the summary information of the domestic new crown epidemic situation, temporal and spatial changes, key areas of concern, etc. Attach screenshots to the paper and give relevant design ideas.

 Use task 1_2 result data data1_2 for plotting

Task 2.2 Design a large digital screen to display and analyze the international epidemic situation and development changes. Attach screenshots to the paper, and give relevant analysis and design ideas.

 Data before processing:

Fill in any dates missing within each country's reporting period. (The output shows that no dates are actually missing, so this step can be skipped.)

# The closing parenthesis was missing on this call (a SyntaxError).
data = pd.read_excel('附件1.xlsx', sheet_name='国际疫情')

# Fill date gaps in the international sheet along the lines of task 1, but
# per country: each country gets a daily calendar from its own first to its
# own last reported date.
country_frames = []
data = data.set_index('日期')
print(data)
for i in data.国家.unique():
    a = data[data['国家'] == i]
    date_range = pd.date_range(start=a.index.min(), end=a.index.max())
    a = a.reindex(index=date_range)

    # Missing-value counts before and after filling, for inspection.
    print(a, a.isna().sum())
    # Carry the previous day's values forward, then back-fill anything still
    # empty. ffill()/bfill() replace the deprecated fillna(method=...).
    a = a.ffill()
    a = a.bfill()
    print(a, a.isna().sum(), '+++++')
    country_frames.append(a.reset_index())

# Concatenate once; each country keeps its own 0..n-1 row index.
date = pd.concat(country_frames)
date = date.rename(columns={'index': '日期'})

Processed data:

  Get 'new' data:

# Derive per-day "new" columns from the columns after date and country.
data = pd.DataFrame()
for country in date.国家.unique():
    country_rows = date[date['国家'] == country].copy()
    totals = country_rows.iloc[:, 2:]
    # First day keeps its own value; every later day is the day-over-day
    # difference.
    first_day = totals.iloc[0:1, :]
    later_days = totals.diff().iloc[1:, :]
    country_rows[['新增确诊', '新增治愈', '新增死亡']] = pd.concat((first_day, later_days))
    data = pd.concat((data, country_rows))

# Save the data for the visualisation dashboard.
data.to_excel(r'data_guoji.xlsx')

Final data:

  The international dashboard has not been built; refer to the dashboard for domestic cities instead.

Task 3 Analysis of the Development of the International Epidemic

Task 3.1 Based on the data in Annex 1 "International Epidemic", divide the epidemic development stages of India, Iran, Italy, Canada, Peru, and South Africa in each time period, and give the basis and results of the division in the paper .

(This task was not completed.)

Task 3.2 Based on the information in Appendix 2, analyze the impact of the epidemic prevention and control measures introduced by the United States, the United Kingdom, and Russia on the changes in the epidemic situation in their own countries.

# Task 3.2: cumulative (top row) and new (bottom row) curves for the three
# countries, one column per country.
data['日期'] = pd.to_datetime(data['日期'])
with sns.color_palette('RdYlGn'):
    fig, axes = plt.subplots(2, 3, figsize=(14, 8))
    row_columns = (
        ['累计确诊', '累计治愈', '累计死亡'],
        ['新增确诊', '新增治愈', '新增死亡'],
    )
    for col_idx, country in enumerate(['美国', '英国', '俄罗斯']):
        by_day = data[data['国家'] == country].set_index('日期')
        for row_idx, columns in enumerate(row_columns):
            panel = axes[row_idx][col_idx]
            by_day[columns].plot(ax=panel)
            panel.set_title(country)
            panel.set_xlabel(None)

    plt.subplots_adjust(0.2, 0.1)

The complete project information of the three Teddy Cup analysis competitions in the blog can be added to WeChat: gjwtxp (20 yuan)

Guess you like

Origin blog.csdn.net/weixin_46707493/article/details/127162800