机器学习之数据处理

 
记录下之前的代码~!!
 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pds
import seaborn as sns
 
# Read the training CSV into a DataFrame, then preview its first rows.
train = pds.read_csv("./train_V2/train_V2.csv")
train.head()
 
  Id groupId matchId assists boosts damageDealt DBNOs headshotKills heals killPlace ... revives rideDistance roadKills swimDistance teamKills vehicleDestroys walkDistance weaponsAcquired winPoints winPlacePerc
0 7f96b2f878858a 4d4b580de459be a10357fd1a4a91 0 0 0.00 0 0 0 60 ... 0 0.0000 0 0.00 0 0 244.80 1 1466 0.4444
1 eef90569b9d03c 684d5656442f9e aeb375fc57110c 0 0 91.47 0 0 0 57 ... 0 0.0045 0 11.04 0 0 1434.00 5 0 0.6400
2 1eaf90ac73de72 6a4a42c3245a74 110163d8bb94ae 1 0 68.00 0 0 0 47 ... 0 0.0000 0 0.00 0 0 161.80 2 0 0.7755
3 4616d365dd2853 a930a9c79cd721 f1f1f4ef412d7e 0 0 32.90 0 0 0 75 ... 0 0.0000 0 0.00 0 0 202.70 3 0 0.1667
4 315c96c26c9aac de04010b3458dd 6dc8ff871e21e6 0 0 100.00 0 0 0 45 ... 0 0.0000 0 0.00 0 0 49.75 2 0 0.1875

5 rows × 29 columns

 
# Print the column dtypes and memory footprint of the training frame.
train.info()
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 object
groupId            object
matchId            object
assists            int64
boosts             int64
damageDealt        float64
DBNOs              int64
headshotKills      int64
heals              int64
killPlace          int64
killPoints         int64
kills              int64
killStreaks        int64
longestKill        float64
matchDuration      int64
matchType          object
maxPlace           int64
numGroups          int64
rankPoints         int64
revives            int64
rideDistance       float64
roadKills          int64
swimDistance       float64
teamKills          int64
vehicleDestroys    int64
walkDistance       float64
weaponsAcquired    int64
winPoints          int64
winPlacePerc       float64
dtypes: float64(6), int64(19), object(4)
memory usage: 983.9+ MB
 

The Killers

 
# Headline kill statistics: mean, 99th percentile and maximum of "kills".
print(
    "the average persion kill {:.4f} players, 99% people have {} kills or less, while the most kills ever recorded is {}.".format(
        train["kills"].mean(),
        train["kills"].quantile(0.99),
        train["kills"].max(),
    )
)
 
the average persion kill 0.9248 players, 99% people have 7.0 kills or less, while the most kills ever recorded is 72.
 
# Plot how many players finished with each kill count.
data = train.copy()

# Collapse everything above the 99th percentile into one "8+" bucket.
# Only the "kills" column is relabelled: assigning through data.loc[mask]
# without a column selector would overwrite every column of those rows.
# The column is cast to str first so the label matches its dtype.
top_kills = data["kills"] > data["kills"].quantile(0.99)
data["kills"] = data["kills"].astype("str")
data.loc[top_kills, "kills"] = "8+"

plt.figure(figsize=(15, 8), dpi=80)
# Pass the series by keyword; the first positional slot is not guaranteed
# to mean "x" across seaborn versions.
sns.countplot(x=data["kills"].sort_values())
plt.title("Kill Count", fontsize=15)
plt.show()
 
 
# Most players finish without a single kill; do they at least deal damage?
# Keep only the zero-kill rows and plot their damage distribution.
data = train.loc[train["kills"] == 0].copy()

plt.figure(figsize=(15, 8), dpi=80)
plt.title("Damage Dealth by 0 kill", fontsize=15)
sns.distplot(data["damageDealt"])
plt.show()
 
 
# Most players neither kill anyone nor deal any damage.
# So: how many players won without a kill, and what share of everyone is that?
# NOTE(review): relies on `data` still holding the zero-kill subset built in
# the preceding cell.
zero_kill_winners = data[data["winPlacePerc"] == 1]
print(
    "{}人一人未杀吃鸡,占总人数的({:.4f}%)".format(
        len(zero_kill_winners),
        100 * len(zero_kill_winners) / len(train),
    )
)
 
16666人一人未杀吃鸡,占总人数的(0.3748%)
 
# Players who dealt no damage at all, and how many of them still won.
no_damage = train["damageDealt"] == 0
data1 = train[no_damage].copy()

no_damage_winners = data1[data1["winPlacePerc"] == 1]
print(
    "{}人一点伤害未打出杀吃鸡,占总人数的({:.4f}%)".format(
        len(no_damage_winners),
        100 * len(no_damage_winners) / len(train),
    )
)
 
4770人一点伤害未打出杀吃鸡,占总人数的(0.1073%)
 
# Joint distribution of final placement against kill count.
sns.jointplot(
    x="winPlacePerc",
    y="kills",
    data=train,
    height=10,
    ratio=3,
    color="r",
)
plt.show()
 
 
# Killing apparently correlates with winning.  Finally, group players by
# kill count (0, 1-2, 3-5, 6-10 and 10+ kills) and compare their placements.
kills = train.copy()

# pds.cut intervals are right-inclusive, so these edges produce (-1, 0],
# (0, 2], (2, 5], (5, 10] and (10, inf]; the labels name exactly those ranges.
# The open-ended last edge keeps every player above 10 kills in the final
# bucket; a finite cap below the observed maximum would turn them into NaN.
kills["killsCategory"] = pds.cut(
    kills["kills"],
    [-1, 0, 2, 5, 10, np.inf],
    labels=["0_kills", "1-2_kills", "3-5_kills", "6-10_kills", "10+kills"],
)

plt.figure(figsize=(15, 8), dpi=80)
sns.boxplot(x="killsCategory", y="winPlacePerc", data=kills)
plt.show()
 
 
# Runners: mean, 99th percentile and maximum of walkDistance.
runner_data = train.copy()

print(
    "平均每人跑{}米,99%的人跑了{}米或少于这些,长跑冠军平均跑了{}米".format(
        runner_data["walkDistance"].mean(),
        runner_data["walkDistance"].quantile(0.99),
        runner_data["walkDistance"].max(),
    )
)
 
平均每人跑1154.2178590962687米,99%的人跑了4396.0米或少于这些,长跑冠军平均跑了25780.0米
 
# Drop the top 1% of walkers so the long tail does not flatten the plot.
below_p99_walk = runner_data["walkDistance"] < train["walkDistance"].quantile(0.99)
runner_data = runner_data[below_p99_walk]

plt.figure(figsize=(15, 8), dpi=80)
plt.title("Walking Distince Distribution", fontsize=15)
sns.distplot(runner_data["walkDistance"])
plt.show()
 
 
# Players who never moved: their count and their share of all players.
# Both numbers are taken from the same runner_data subset; computing the
# percentage from the zero-damage frame (data1) reports a share that does
# not correspond to the printed count.
stationary = runner_data[runner_data["walkDistance"] == 0]
print(
    "{}玩家({:.4f}%) 跑了0米,也就意味着他们一步没动就死亡了".format(
        len(stationary),
        100 * len(stationary) / len(train),
    )
)
 
99603玩家(2.0329%) 跑了0米,也就意味着他们一步没动就死亡了
 
# Joint distribution of final placement against distance walked.
sns.jointplot(
    x="winPlacePerc",
    y="walkDistance",
    data=train,
    height=10,
    ratio=3,
    color="lime",
)
plt.show()
 
 
# Drivers: mean, 99th percentile and maximum of rideDistance.
driver_data = train.copy()
print(
    "每人平均使用载具运行了{}米,99%的人开了{}米,或少于这些,开车最远开了{}米。".format(
        driver_data["rideDistance"].mean(),
        driver_data["rideDistance"].quantile(0.99),
        driver_data["rideDistance"].max(),
    )
)
 
每人平均使用载具运行了606.115669154093米,99%的人开了6966.0米,或少于这些,开车最远开了40710.0米。
 
# Drop the top 1% of drivers before plotting the ride-distance distribution.
below_p99_ride = driver_data["rideDistance"] < train["rideDistance"].quantile(0.99)
driver_data = driver_data[below_p99_ride]

plt.figure(figsize=(15, 8), dpi=80)
sns.distplot(driver_data["rideDistance"])
plt.title("Ride Distance Distribution")
plt.show()
 
 
# Players who never drove: their count and their share of all players.
# Both numbers are taken from the same driver_data subset; computing the
# percentage from the zero-damage frame (data1) reports a share that does
# not correspond to the printed count.
non_drivers = driver_data[driver_data["rideDistance"] == 0]
print(
    "{} players dirve ({:.4f}%) for 0 meters, This means that they don't have a driving yet".format(
        len(non_drivers),
        100 * len(non_drivers) / len(train),
    )
)
 
3309429 players dirve (23.1022%) for 0 meters, This means that they don't have a driving yet
 
# Joint distribution of final placement against distance driven.
sns.jointplot(
    x="winPlacePerc",
    y="rideDistance",
    data=train,
    height=10,
    ratio=3,
    color="y",
)
plt.show()
 
 
# There is a small correlation between rideDistance and winPlacePerc.
# Next, look at the number of vehicles destroyed.

f, ax1 = plt.subplots(figsize=(15, 8))
sns.pointplot(x="vehicleDestroys", y="winPlacePerc", data=driver_data, alpha=0.8)
# Call the label setters.  Assigning to plt.xlabel / plt.ylabel replaces the
# pyplot functions with plain strings, so no label is set and any later
# plt.xlabel(...) call in the session fails.
plt.xlabel("Number of Vehicle Destroys")
plt.ylabel("winPlacePerc")
plt.title("Vehicle Destroys/ Win Ratio", fontsize=20, color="blue")
plt.grid()
plt.show()
 
 
# This plot suggests that the more vehicles a player destroys, the better their chance of winning.
# Heals

# Healing items: mean, 99th percentile and maximum of "heals".
heal_data = train.copy()

print(
    "average players use the {:.1f} heals, 99% players use {} heals or less, the doctor in the PUBG use {} heals ".format(
        heal_data["heals"].mean(),
        heal_data["heals"].quantile(0.99),
        heal_data["heals"].max(),
    )
)
 
average players use the 1.4 heals, 99% players use 12.0 heals or less, the doctor in the PUBG use 80 heals 
 
# Boost items: mean, 99th percentile and maximum of "boosts".
print(
    "average players use the {:.1f} boost, 99% players use {} boost or less, the doctor in the PUBG use {} boost ".format(
        heal_data["boosts"].mean(),
        heal_data["boosts"].quantile(0.99),
        heal_data["boosts"].max(),
    )
)
 
average players use the 1.1 boost, 99% players use 7.0 boost or less, the doctor in the PUBG use 33 boost 
 
# Compare mean placement per number of heals and per number of boosts,
# after dropping rows at or above the 99th percentile of either item count.
heal_data = heal_data[heal_data["heals"] < train["heals"].quantile(0.99)]
heal_data = heal_data[heal_data["boosts"] < train["boosts"].quantile(0.99)]

# plt.subplots creates the figure; a preceding bare plt.figure() would only
# leave an extra empty figure behind.
f, ax1 = plt.subplots(figsize=(15, 8))
sns.pointplot(x="heals", y="winPlacePerc", data=heal_data, color="lime", alpha=0.8, label="heals")
sns.pointplot(x="boosts", y="winPlacePerc", data=heal_data, color="blue", alpha=0.8, label="boosts")
# Call the label setters; assigning to plt.xlabel / plt.ylabel would replace
# the pyplot functions with strings and leave the shared axes mislabeled.
plt.xlabel("Number of heal/boost items")
plt.ylabel("winPlacePerc")
# Hand-placed text acts as the legend for the two series.
plt.text(4, 0.6, "Heals", color="lime")
plt.text(4, 0.55, "boosts", color="blue")
plt.title("Heals vs Boosts")
plt.grid()
plt.show()
 
<Figure size 432x288 with 0 Axes>
 
 
# Work on a copy of the training frame and show the longest swim distance.
swim_data = train.copy()
swim_data["swimDistance"].max()
 
3823.0
 
# Bucket swim distance and compare placement across the buckets.
# Start from a fresh copy: filtering swim_data first is pointless when the
# result is immediately replaced by train.copy().
swim_data = train.copy()
# pds.cut intervals are right-inclusive: (-1, 0], (0, 5], (5, 20], (20, inf].
# The open-ended last edge avoids hard-coding the maximum of one data set.
swim_data["swimDistance"] = pds.cut(
    swim_data["swimDistance"],
    [-1, 0, 5, 20, np.inf],
    labels=["0m", "0-5m", "6-20m", "20+"],
)

plt.figure(figsize=(15, 8), dpi=80)
sns.boxplot(x="swimDistance", y="winPlacePerc", data=swim_data)
plt.show()
 
 
# Joint distribution of final placement against heals used.
# sns.jointplot builds its own figure (sized via height=), so no plt.figure
# call is made first; one would only leave an empty figure behind.
sns.jointplot(x="winPlacePerc", y="heals", data=train, ratio=3, height=10, color="lime")
plt.show()
 
<Figure size 1200x640 with 0 Axes>
 
 
# Joint distribution of final placement against boosts used.
# sns.jointplot builds its own figure (sized via height=), so no plt.figure
# call is made first; one would only leave an empty figure behind.
sns.jointplot(x="winPlacePerc", y="boosts", data=train, ratio=3, height=10, color="blue")
plt.show()
 
<Figure size 1200x640 with 0 Axes>
 
 
# Split matches into modes using the number of groups as a proxy:
# more than 50 groups -> solos, 26-50 -> duos, 25 or fewer -> squads.
group_count = train["numGroups"]
solos = train[group_count > 50]
duos = train[(group_count > 25) & (group_count <= 50)]
squads = train[group_count <= 25]

total_players = len(train)
print(
    "{} players ({:.2f}%) play solos,{} players ({:.2f}%) play duos, {} players ({:.2f}%) play squads".format(
        len(solos),
        100 * len(solos) / total_players,
        len(duos),
        100 * len(duos) / total_players,
        len(squads),
        100 * len(squads) / total_players,
    )
)
 
709111 players (15.95%) play solos,3295326 players (74.10%) play duos, 442529 players (9.95%) play squads
 
# Mean placement per kill count, one series per game mode.
# plt.subplots creates the figure; a preceding plt.figure(...) would only
# leave an extra empty figure behind.
f, ax1 = plt.subplots(figsize=(15, 8))
sns.pointplot(x="kills", y="winPlacePerc", data=solos, color="black", alpha=0.8)
sns.pointplot(x="kills", y="winPlacePerc", data=duos, color="blue", alpha=0.8)
sns.pointplot(x="kills", y="winPlacePerc", data=squads, color="yellow", alpha=0.8)
plt.title("winPlacePerc in solos vs duos vs squads", fontsize=15)
# Hand-placed text acts as the legend for the three series.
plt.text(47, 0.6, "solos", color="black", fontsize=12)
plt.text(47, 0.55, "duos", color="blue", fontsize=12)
plt.text(47, 0.5, "squads", color="yellow", fontsize=12)
# Call the label setters; assigning to plt.ylabel / plt.xlabel would replace
# the pyplot functions with strings instead of labelling the axes.
plt.ylabel("Win Percentage")
plt.xlabel("Number of kills")
plt.grid()
plt.show()
 
<Figure size 1200x640 with 0 Axes>
 
 
# Correlation heatmap over all numeric columns.
f, ax = plt.subplots(figsize=(15, 15), dpi=80)
# Restrict to numeric columns explicitly: the frame also holds object columns
# (Id, groupId, matchId, matchType), which newer pandas refuses to correlate.
sns.heatmap(
    train.select_dtypes(include="number").corr(),
    annot=True,
    linewidths=.5,
    fmt=".1f",
    ax=ax,
)
# plt.show must be called; a bare `plt.show` only evaluates to the function.
plt.show()
 
<function matplotlib.pyplot.show(*args, **kw)>
 
 
# Heatmap of the k columns most correlated with winPlacePerc.  The target is
# perfectly correlated with itself, so it is always one of the k columns.
k = 5
f, ax = plt.subplots(figsize=(15, 15))
# Restrict to numeric columns explicitly: the frame also holds object columns,
# which newer pandas refuses to correlate.
cols = (
    train.select_dtypes(include="number")
    .corr()
    .nlargest(k, "winPlacePerc")["winPlacePerc"]
    .index
)
cm = np.corrcoef(train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt=".2f",
    annot_kws={"size": 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()
 
 
# Reset seaborn styling, then keep only the target and five chosen features.
sns.set()
cols = [
    "winPlacePerc",
    "walkDistance",
    "boosts",
    "weaponsAcquired",
    "damageDealt",
    "killPlace",
]
data_new = train[cols]

data_new
 
  winPlacePerc walkDistance boosts weaponsAcquired damageDealt killPlace
0 0.4444 244.80 0 1 0.000 60
1 0.6400 1434.00 0 5 91.470 57
2 0.7755 161.80 0 2 68.000 47
3 0.1667 202.70 0 3 32.900 75
4 0.1875 49.75 0 2 100.000 45
5 0.0370 34.70 0 1 100.000 44
6 0.0000 13.50 0 1 0.000 96
7 0.7368 1089.00 0 6 8.538 48
8 0.3704 799.90 0 4 51.600 64
9 0.2143 65.67 0 1 37.270 74
10 0.3929 868.30 0 9 28.380 75
11 0.4043 451.70 0 1 137.900 64
12 0.9286 2784.00 0 6 0.000 37
13 0.8750 2050.00 1 6 324.200 5
14 0.9000 1666.00 1 5 122.800 25
15 0.2766 105.10 0 5 80.710 72
16 0.7308 3674.00 2 7 81.710 25
17 0.8211 1787.00 3 3 254.300 13
18 0.1923 137.40 0 2 0.000 79
19 0.9310 3310.00 1 3 65.280 48
20 0.6383 1794.00 4 5 269.100 18
21 0.2143 580.10 0 2 158.700 75
22 0.7500 1264.00 1 4 192.300 15
23 0.9592 2727.00 6 7 1011.000 2
24 0.9231 3503.00 3 4 327.600 3
25 0.8696 2711.00 4 7 558.600 11
26 0.1154 15.30 0 0 44.280 78
27 0.7234 1933.00 4 4 381.200 7
28 0.9630 3855.00 6 4 345.600 6
29 0.0000 0.00 0 0 0.000 87
... ... ... ... ... ... ...
4446936 0.7308 2708.00 1 7 68.200 22
4446937 0.7111 1364.00 2 5 127.400 31
4446938 0.0385 173.10 0 1 0.000 64
4446939 0.3830 56.14 0 1 151.500 35
4446940 0.6250 1362.00 1 6 0.000 58
4446941 0.1600 65.21 0 2 62.350 79
4446942 1.0000 2162.00 4 8 724.700 1
4446943 0.1111 57.59 0 2 0.000 82
4446944 0.6875 1349.00 0 5 175.000 29
4446945 0.1875 57.19 0 1 0.000 81
4446946 0.7292 2591.00 0 7 0.000 53
4446947 0.3830 631.10 0 4 0.000 61
4446948 0.7917 1685.00 3 3 736.500 7
4446949 0.1458 424.60 0 3 100.000 32
4446950 0.5000 1559.00 0 5 203.500 32
4446951 0.1000 44.90 0 1 0.000 85
4446952 0.8462 1177.00 0 5 0.000 44
4446953 0.5926 1025.00 0 5 30.100 57
4446954 0.5306 2146.00 0 6 30.100 58
4446955 0.4792 1158.00 0 3 0.000 60
4446956 0.1071 828.30 0 7 151.900 77
4446957 0.4583 363.70 1 2 100.000 32
4446958 0.0000 0.00 0 0 0.000 92
4446959 0.0842 40.25 0 1 22.680 89
4446960 0.2414 845.60 0 3 327.700 4
4446961 0.1786 1019.00 0 3 0.000 74
4446962 0.2935 81.70 1 6 44.150 69
4446963 0.4815 788.70 0 4 59.060 66
4446964 0.8000 2748.00 4 8 180.400 11
4446965 0.5464 1244.00 2 5 268.000 18

4446966 rows × 6 columns

 
# Pair plot on the first 10000 rows only.
data_new = data_new.head(10000)
sns.pairplot(data_new, height=2.5)
plt.show()
 
 
 

猜你喜欢

转载自www.cnblogs.com/knight-vien/p/10405318.html