Cleaning Double Color Ball (双色球) lottery data with Python

Data curation

import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
# IPython/Jupyter magic (not plain Python): render matplotlib figures inline.
%matplotlib inline
plt.rcParams['font.sans-serif']=['Microsoft YaHei'] # so that Chinese labels display correctly
plt.rcParams['axes.unicode_minus']=False # so that the minus sign displays correctly
from datetime import datetime
# NOTE(review): this opens a 16x10 figure, but nothing in the visible code
# draws on it (the plots further down create their own figure) — confirm
# whether it is still needed.
plt.figure(figsize=(16,10))
import pyecharts.options as opts
from pyecharts.charts import Line
from pyecharts.faker import Faker
from pyecharts.charts import Bar
import os
from pyecharts.options.global_options import ThemeType
# Load the raw data from the CSV file and preview the first rows.
ssqdata=pd.read_csv("getssq.csv")
ssqdata.head()

insert image description here

1. Modify the header

Change the header and remove the useless data:

# Round-trip the raw frame through an Excel workbook, then reload only
# spreadsheet columns B-G under the new header names.
ssqnames = ['date', 'id', 'numbers', 'total', 'first', 'second']
ssqdata.to_excel('getssqv2.xlsx')
ssqdatav2 = pd.read_excel(
    "getssqv2.xlsx",
    usecols='B:G',
    names=ssqnames,
)
ssqdatav2.head()

insert image description here
Convert the date column to a datetime type:

# Parse the 'date' column into pandas datetimes, then list the column
# dtypes to check the conversion.
parsed_dates = pd.to_datetime(ssqdatav2['date'])
ssqdatav2['date'] = parsed_dates
ssqdatav2.dtypes

insert image description here

2. Add time column

# Add helper columns derived from the draw date.
# The .dt accessor only works when 'date' is already datetime64[ns].
draw_date = ssqdatav2['date']
ssqdatav2['ssqyear'] = draw_date.dt.year
ssqdatav2['ssqmonth'] = draw_date.dt.month
# The quarter number must be turned into text before it can be joined to 'Q'.
ssqdatav2['ssqquarter'] = 'Q' + draw_date.dt.quarter.astype(str)
# Vectorized formatting instead of a per-row strftime call.
ssqdatav2['ssqym'] = draw_date.dt.strftime('%Y%m')
ssqdatav2['ssqyq'] = draw_date.dt.to_period('Q')
# Season label per calendar quarter (months 1-3 -> 'spring', 4-6 -> 'summer',
# 7-9 -> 'autumn', 10-12 -> 'winter'). Rows with a missing date stay missing
# instead of being labelled 'winter'.
ssqdatav2['ssqseason'] = draw_date.dt.quarter.map(
    {1: 'spring', 2: 'summer', 3: 'autumn', 4: 'winter'}
)
ssqdatav2.head()

insert image description here

3. Split winning numbers

# How to split the winning numbers
import re

# Column names for the pieces of the 'numbers' string.
numnames = [
    'n01', 'n02', 'n03', 'n04',
    'n05', 'n06', 'n07',
]
# first prize
fpnames = ['fpcounts', 'p01', 'p02', 'p03', 'p04']

# Note: the separator is two spaces. expand=True returns one column per piece.
# NOTE(review): .replace() is called without arguments; its effect depends on
# the pandas version — confirm whether it is needed at all.
split_numbers = ssqdatav2['numbers'].str.split('  ', expand=True).replace()
ssqdatav2[numnames] = split_numbers
ssqdatav2.head(2)

insert image description here

4. Use regular expressions to get Chinese characters

insert image description here

# Helper for breaking down the first-prize field
def fpp(x):
    """Return the Chinese characters contained in *x*.

    Inputs of at most two characters yield the placeholder "待定"
    ("to be determined"). Longer inputs are reduced to their characters
    in the range U+4E00..U+9FA5, concatenated in their original order;
    the result is an empty string when there are none.
    """
    if len(x) > 2:
        # Use a regular expression to keep only the Chinese characters.
        return ''.join(re.findall("[\u4e00-\u9fa5]", x))
    return "待定"
# Apply fpp() to every value of the 'first' column
first_prize_text = ssqdatav2['first']
ssqdatav2['fpprovince'] = first_prize_text.apply(fpp)
ssqdatav2.head(310)

insert image description here
insert image description here
insert image description here

ssqdatav2.dtypes
# 'total' is the sales amount; how can it be converted to float or int?
# Custom function that converts a text value into a number
def t2f(x):
    """Convert a text amount to a float by keeping only its digits.

    Every run of digits in *x* is concatenated and the result is parsed
    as a float, so separators and unit text are dropped
    ("1,234" -> 1234.0). A decimal point is dropped as well
    ("12.5" -> 125.0).

    Raises:
        ValueError: if *x* contains no digits.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # that newer Python versions warn about.
    return float(''.join(re.findall(r'\d+', x)))
# Numeric version of the 'total' column, produced by t2f().
total_text = ssqdatav2['total']
ssqdatav2['total2'] = total_text.apply(t2f)

5. Plot pivot-style counts

# Pivot-style counts: one line plot per number column on a 2x4 grid.
fig, axes = plt.subplots(2, 4, figsize=(10, 9))
ssqdatav2['ncount'] = 1
# axes.flat walks the grid row by row, so the seven columns land on
# axes[0,0] .. axes[1,2]; the eighth subplot stays empty.
ball_columns = ('n01', 'n02', 'n03', 'n04', 'n05', 'n06', 'n07')
for ball_col, ax in zip(ball_columns, axes.flat):
    ssqdatav2.groupby([ball_col])['ncount'].count().plot(ax=ax)

Display how often each number was drawn in each ball position:

insert image description here

Guess you like

Origin blog.csdn.net/wxfighting/article/details/124203195