数据整理
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# Font configuration so that Chinese labels and minus signs render correctly.
plt.rcParams.update({
    'font.sans-serif': ['Microsoft YaHei'],  # font used to display Chinese labels
    'axes.unicode_minus': False,  # lets the minus sign display correctly
})
from datetime import datetime
# Create a new figure with figsize (16, 10).
# NOTE(review): nothing is drawn on this figure in the visible code — confirm it is needed.
plt.figure(figsize=(16,10))
import pyecharts.options as opts
from pyecharts.charts import Line
from pyecharts.faker import Faker
from pyecharts.charts import Bar
import os
from pyecharts.options.global_options import ThemeType
# Load the raw data from getssq.csv (path is relative to the working directory).
# NOTE(review): presumably lottery draw records, judging by the later "winning numbers"
# and "first prize" cells — confirm against the data source.
ssqdata=pd.read_csv("getssq.csv")
# Preview the first rows.
ssqdata.head()
1、修改表头
更改表头并去除无用数据:
# Replace the header row with short English column names and keep only the
# first six data columns. The frame is written to an Excel workbook and read
# back: on re-reading, only sheet columns B-G are kept (to_excel is called
# without index=False, so column A holds the index) and `names` supplies the
# new column labels.
# NOTE(review): the round trip needs an Excel engine and leaves getssqv2.xlsx
# on disk; confirm whether that file is used elsewhere before simplifying.
ssqnames = [
    'date', 'id', 'numbers',
    'total', 'first', 'second',
]
ssqdata.to_excel(excel_writer='getssqv2.xlsx')
ssqdatav2 = pd.read_excel(
    io="getssqv2.xlsx",
    usecols='B:G',
    names=ssqnames,
)
ssqdatav2.head()
更改日期类型:
# Parse the 'date' column with pd.to_datetime so the .dt accessor can be used on it later.
ssqdatav2['date']=pd.to_datetime(ssqdatav2['date'])
# Inspect the resulting column dtypes.
ssqdatav2.dtypes
2、增加时间列
# Add helper calendar columns derived from the parsed 'date' column.
# Everything below goes through the .dt accessor, so 'date' must already be a
# datetime column; otherwise these calls fail.
ssqdatav2['ssqyear'] = ssqdatav2['date'].dt.year  # calendar year
ssqdatav2['ssqmonth'] = ssqdatav2['date'].dt.month  # month number
# Quarter label such as 'Q1'; the numeric quarter has to be cast to str before
# it can be concatenated with the 'Q' prefix.
ssqdatav2['ssqquarter'] = 'Q' + ssqdatav2['date'].dt.quarter.astype(str)
# Year-month key such as '202001'. The vectorized .dt.strftime returns a
# missing value for NaT, whereas calling strftime row by row raises on NaT.
ssqdatav2['ssqym'] = ssqdatav2['date'].dt.strftime('%Y%m')
ssqdatav2['ssqyq'] = ssqdatav2['date'].dt.to_period('Q')  # quarterly period
# Season label by calendar quarter: months 1-3 'spring', 4-6 'summer',
# 7-9 'autumn', 10-12 'winter'. A missing month stays missing instead of
# being labelled 'winter'.
ssqdatav2['ssqseason'] = ((ssqdatav2['ssqmonth'] - 1) // 3).map(
    {0: 'spring', 1: 'summer', 2: 'autumn', 3: 'winter'}
)
ssqdatav2.head()
3、分割获奖号码
# Split the winning-number string in 'numbers' into one column per number.
import re
# Target column names for the seven pieces produced by the split.
numnames=['n01','n02','n03','n04','n05','n06','n07']
# Column names for the first-prize breakdown (not used within this cell).
fpnames=['fpcounts','p01','p02','p03','p04']
# NOTE(review): .replace() is called with no arguments; its effect depends on the
# pandas version (deprecated / may fill missing cells or raise) — confirm the intent.
# NOTE(review): the original comment says the separator is two spaces, but the literal
# as seen here is a single space — verify against the raw data.
ssqdatav2[numnames]=ssqdatav2['numbers'].str.split(' ',expand=True).replace() # separator: see review note above
ssqdatav2.head(2)
4、使用正则表达式获取汉字
# Helper used to break down the 'first' (first-prize) column.
# Compiled once at definition time instead of on every call; matches a single
# character in the range U+4E00..U+9FA5 (Chinese characters).
_FPP_CJK_RE = re.compile("[\u4e00-\u9fa5]")


def fpp(x):
    """Return the Chinese characters contained in *x*, joined in order.

    Values of at most two characters are treated as carrying no usable detail
    and yield the placeholder "待定". Non-string values (for example NaN from a
    missing cell) yield the same placeholder instead of raising TypeError.
    """
    if not isinstance(x, str) or len(x) <= 2:
        return "待定"
    # Keep only the Chinese characters; digits, spaces and punctuation are dropped.
    return ''.join(_FPP_CJK_RE.findall(x))
# Apply fpp() to every entry of 'first' and store the result in 'fpprovince'.
ssqdatav2['fpprovince'] = ssqdatav2['first'].apply(fpp)
ssqdatav2.head(310)
ssqdatav2.dtypes
# 'total' holds the sales amount as text; t2f converts such text to a float.
def t2f(x):
    """Convert a text amount to float by concatenating all of its digit runs.

    Every non-digit character (thousands separators, currency text, ...) is
    discarded, so "392,543,498" becomes 392543498.0. Note that a decimal point
    is discarded as well, because only digit runs are kept.

    Non-string input is passed to float() directly. Input that cannot be
    converted, or a string without any digit, yields NaN instead of raising.
    """
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return float('nan')
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    digits = ''.join(re.findall(r'\d+', x))
    return float(digits) if digits else float('nan')
# Numeric copy of the textual 'total' column, converted entry by entry with t2f.
ssqdatav2['total2'] = ssqdatav2['total'].apply(t2f)
5、绘制透视图
# Pivot-style overview: one line chart per number column, showing how many
# rows carry each distinct value of that column.
fig, axes = plt.subplots(2, 4, figsize=(10, 9))
ssqdatav2['ncount'] = 1  # helper column that is counted within each group
# axes.flat walks the 2x4 grid row by row, so the seven columns fill the first
# seven panels and the eighth panel stays empty.
for _ax, _col in zip(axes.flat, ('n01', 'n02', 'n03', 'n04', 'n05', 'n06', 'n07')):
    ssqdatav2.groupby([_col])['ncount'].count().plot(ax=_ax)
显示每个号码当中购买频率分布: