记录下之前的代码~！！

import matplotlib.pyplot as plt
import numpy as np
import pandas as pds
import seaborn as sns

# Load the training set and preview its first rows.
train = pds.read_csv("./train_V2/train_V2.csv")
train.head()

	Id	groupId	matchId	assists	damageDealt	killPlace	...	rideDistance	swimDistance	walkDistance	weaponsAcquired	winPoints	winPlacePerc
0	7f96b2f878858a	4d4b580de459be	a10357fd1a4a91	0	0.00	60	...	0.0000	0.00	244.80	1	1466	0.4444
1	eef90569b9d03c	684d5656442f9e	aeb375fc57110c	0	91.47	57	...	0.0045	11.04	1434.00	5	0	0.6400
2	1eaf90ac73de72	6a4a42c3245a74	110163d8bb94ae	1	68.00	47	...	0.0000	0.00	161.80	2	0	0.7755
3	4616d365dd2853	a930a9c79cd721	f1f1f4ef412d7e	0	32.90	75	...	0.0000	0.00	202.70	3	0	0.1667
4	315c96c26c9aac	de04010b3458dd	6dc8ff871e21e6	0	100.00	45	...	0.0000	0.00	49.75	2	0	0.1875

5 rows × 29 columns

# Print the frame summary: index range, per-column dtype and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4446966 entries, 0 to 4446965
Data columns (total 29 columns):
Id                 object
groupId            object
matchId            object
assists            int64
boosts             int64
damageDealt        float64
DBNOs              int64
headshotKills      int64
heals              int64
killPlace          int64
killPoints         int64
kills              int64
killStreaks        int64
longestKill        float64
matchDuration      int64
matchType          object
maxPlace           int64
numGroups          int64
rankPoints         int64
revives            int64
rideDistance       float64
roadKills          int64
swimDistance       float64
teamKills          int64
vehicleDestroys    int64
walkDistance       float64
weaponsAcquired    int64
winPoints          int64
winPlacePerc       float64
dtypes: float64(6), int64(19), object(4)
memory usage: 983.9+ MB

The Killers

# Report the mean, 99th percentile and maximum of the per-player kill count.
kills_col = train["kills"]
kill_stats = (kills_col.mean(), kills_col.quantile(0.99), kills_col.max())
print("the average persion kill {:.4f} players, 99% people have {} kills or less, while the most kills ever recorded is {}.".format(*kill_stats))

the average persion kill 0.9248 players, 99% people have 7.0 kills or less, while the most kills ever recorded is 72.

# Plot how many players achieved each kill count, pooling everything above
# the 99th percentile into a single "8+" bucket.
data = train.copy()

# Build the mask on the numeric column first, then relabel ONLY the "kills"
# column. Assigning to data.loc[mask] without a column selector would
# overwrite every column of the matching rows with "8+".
over_p99 = data["kills"] > data["kills"].quantile(0.99)
data["kills"] = data["kills"].astype("str")
data.loc[over_p99, "kills"] = "8+"

plt.figure(figsize=(15, 8), dpi=80)
# Pass the series by keyword so it is always interpreted as the x variable.
sns.countplot(x=data["kills"].sort_values())
plt.title("Kill Count", fontsize=15)
plt.show()

# Most players never get a kill. Do they at least deal damage?
data = train.copy()

# Keep only the players with zero kills and plot their damage distribution.
data = data[data["kills"] == 0]
plt.figure(figsize=(15, 8), dpi=80)
plt.title("Damage Dealth by 0 kill", fontsize=15)
sns.distplot(data["damageDealt"])
plt.show()

# Most players neither kill anyone nor deal any damage.
# Count the players in `data` (the zero-kill subset built in the previous
# cell) who still finished first, and their share of all players.
zero_kill_winners = len(data[data["winPlacePerc"] == 1])
zero_kill_share = 100 * zero_kill_winners / len(train)
print("{}人一人未杀吃鸡，占总人数的({:.4f}%)".format(zero_kill_winners, zero_kill_share))

16666人一人未杀吃鸡，占总人数的(0.3748%)

# Players who dealt no damage at all; count how many of them still finished
# first and what share of all players that is.
data1 = train[train["damageDealt"] == 0].copy()

zero_damage_winners = len(data1[data1["winPlacePerc"] == 1])
zero_damage_share = 100 * zero_damage_winners / len(train)
print("{}人一点伤害未打出杀吃鸡，占总人数的({:.4f}%)".format(zero_damage_winners, zero_damage_share))

4770人一点伤害未打出杀吃鸡，占总人数的(0.1073%)

# Joint distribution of kill count against final placement.
sns.jointplot(x="winPlacePerc", y="kills", data=train, height=10, ratio=3, color="r")
plt.show()

# Killing apparently correlates with winning. Group players by kill count
# (0, 1-2, 3-5, 6-10 and 10+ kills) and compare placement per group.
kills = train.copy()

# pd.cut bins are right-inclusive: (-1,0], (0,2], (2,5], (5,10], (10,inf).
# The last bin is open-ended so that players with more than 60 kills (the
# recorded maximum is 72) are not left outside every bin and turned into NaN.
# Labels match the integer kill counts that actually fall into each bin.
kills["killsCategory"] = pds.cut(
    kills["kills"],
    [-1, 0, 2, 5, 10, np.inf],
    labels=["0_kills", "1-2_kills", "3-5_kills", "6-10_kills", "10+kills"],
)
plt.figure(figsize=(15, 8), dpi=80)
sns.boxplot(x="killsCategory", y="winPlacePerc", data=kills)
plt.show()

# Runners: report the mean, 99th percentile and maximum walking distance.
runner_data = train.copy()

walk_col = runner_data["walkDistance"]
walk_stats = (walk_col.mean(), walk_col.quantile(0.99), walk_col.max())
print("平均每人跑{}米，99%的人跑了{}米或少于这些，长跑冠军平均跑了{}米".format(*walk_stats))

平均每人跑1154.2178590962687米，99%的人跑了4396.0米或少于这些，长跑冠军平均跑了25780.0米

# Drop the top 1% of walkers so the outliers do not stretch the x axis,
# then plot the distribution of walking distance.
runner_data = runner_data[runner_data["walkDistance"] < train["walkDistance"].quantile(0.99)]
plt.figure(figsize=(15, 8), dpi=80)
plt.title("Walking Distince Distribution", fontsize=15)
sns.distplot(runner_data["walkDistance"])
plt.show()

# Players who never moved: count them and report their share of all players.
# The percentage must be derived from the same zero-walk count; the
# zero-damage frame (data1) is a different subset and gives an unrelated ratio.
zero_walkers = len(runner_data[runner_data["walkDistance"] == 0])
zero_walk_share = 100 * zero_walkers / len(train)
print("{}玩家({:.4f}%) 跑了0米，也就意味着他们一步没动就死亡了".format(zero_walkers, zero_walk_share))

99603玩家(2.0329%) 跑了0米，也就意味着他们一步没动就死亡了

# Joint distribution of walking distance against final placement.
sns.jointplot(x="winPlacePerc", y="walkDistance", data=train, height=10, ratio=3, color="lime")
plt.show()

# Drivers: report the mean, 99th percentile and maximum ride distance.
driver_data = train.copy()
print("每人平均使用载具运行了{}米,99%的人开了{}米，或少于这些,开车最远开了{}米。".format(driver_data["rideDistance"].mean(), driver_data["rideDistance"].quantile(0.99), driver_data["rideDistance"].max()))

每人平均使用载具运行了606.115669154093米,99%的人开了6966.0米，或少于这些,开车最远开了40710.0米。

# Drop the top 1% of drivers, then plot the distribution of ride distance.
driver_data = driver_data[driver_data["rideDistance"] < train["rideDistance"].quantile(0.99)]
plt.figure(figsize=(15, 8), dpi=80)
sns.distplot(driver_data["rideDistance"])
plt.title("Ride Distance Distribution")
plt.show()

# Players who never drove: count them and report their share of all players.
# The percentage is derived from the same zero-ride count rather than from
# the zero-damage frame (data1), which is a different subset of players.
zero_drivers = len(driver_data[driver_data["rideDistance"] == 0])
zero_drive_share = 100 * zero_drivers / len(train)
print("{} players dirve ({:.4f}%) for 0 meters, This means that they don't have a driving yet".format(zero_drivers, zero_drive_share))

3309429 players dirve (23.1022%) for 0 meters, This means that they don't have a driving yet

# Joint distribution of ride distance against final placement.
sns.jointplot(x="winPlacePerc", y="rideDistance", data=train, height=10, ratio=3, color="y")
plt.show()

# There is a small correlation between rideDistance and winPlacePerc.
# Next, look at the number of vehicles destroyed.

f, ax1 = plt.subplots(figsize=(15, 8))
sns.pointplot(x="vehicleDestroys", y="winPlacePerc", data=driver_data, alpha=0.8, ax=ax1)
# Label through the Axes object. Writing `plt.xlabel = "..."` only rebinds
# the pyplot function to a string: the plot stays unlabelled and any later
# call to plt.xlabel() in the same session breaks.
ax1.set_xlabel("Number of Vehicle Destroys")
ax1.set_ylabel("winPlacePerc")
ax1.set_title("Vehicle Destroys/ Win Ratio", fontsize=20, color="blue")
ax1.grid()
plt.show()

# The plot above suggests that players who destroy more vehicles tend to
# place higher.
# Heals

heal_data = train.copy()

heals_col = heal_data["heals"]
heal_stats = (heals_col.mean(), heals_col.quantile(0.99), heals_col.max())
print("average players use the {:.1f} heals, 99% players use {} heals or less, the doctor in the PUBG use {} heals ".format(*heal_stats))

average players use the 1.4 heals, 99% players use 12.0 heals or less, the doctor in the PUBG use 80 heals

# Report the mean, 99th percentile and maximum number of boost items used.
boosts_col = heal_data["boosts"]
boost_stats = (boosts_col.mean(), boosts_col.quantile(0.99), boosts_col.max())
print("average players use the {:.1f} boost, 99% players use {} boost or less, the doctor in the PUBG use {} boost ".format(*boost_stats))

average players use the 1.1 boost, 99% players use 7.0 boost or less, the doctor in the PUBG use 33 boost

# Keep players below the 99th percentile for both heals and boosts, then
# compare how each item count relates to final placement.
heal_data = heal_data[heal_data["heals"] < train["heals"].quantile(0.99)]
heal_data = heal_data[heal_data["boosts"] < train["boosts"].quantile(0.99)]

# One subplots() call is enough; an extra plt.figure() beforehand only
# leaves behind an empty figure.
f, ax1 = plt.subplots(figsize=(15, 8))
sns.pointplot(x="heals", y="winPlacePerc", data=heal_data, color="lime", alpha=0.8, label="heals", ax=ax1)
sns.pointplot(x="boosts", y="winPlacePerc", data=heal_data, color="blue", alpha=0.8, label="boosts", ax=ax1)
# Set the labels after plotting (seaborn writes its own axis labels) and do
# it through the Axes: `plt.xlabel = "..."` would merely rebind the pyplot
# function to a string.
ax1.set_xlabel("Number of heal/boost items")
ax1.set_ylabel("winPlacePerc")
# Hand-placed colour key instead of a legend.
ax1.text(4, 0.6, "Heals", color="lime")
ax1.text(4, 0.55, "boosts", color="blue")
ax1.set_title("Heals vs Boosts")
ax1.grid()
plt.show()

<Figure size 432x288 with 0 Axes>

# Longest swim distance in the data set (cell result shown below).
swim_data = train.copy()
swim_data["swimDistance"].max()

3823.0

# Bucket swim distance and compare final placement per bucket.
# The former 99th-percentile filter is omitted: its result was discarded
# immediately by re-copying `train`, so it never affected the plot.
swim_data = train.copy()
# Right-inclusive bins: (-1,0], (0,5], (5,20], (20,inf). The open-ended last
# bin replaces the hard-coded maximum so no row can fall outside the bins.
swim_data["swimDistance"] = pds.cut(
    swim_data["swimDistance"],
    [-1, 0, 5, 20, np.inf],
    labels=["0m", "0-5m", "6-20m", "20+"],
)
plt.figure(figsize=(15, 8), dpi=80)
sns.boxplot(x="swimDistance", y="winPlacePerc", data=swim_data)
plt.show()

# Joint distribution of heals against final placement.
# jointplot builds its own figure (sized by `height`), so a preceding
# plt.figure() call would only produce an extra, empty figure.
sns.jointplot(x="winPlacePerc", y="heals", data=train, ratio=3, height=10, color="lime")
plt.show()

<Figure size 1200x640 with 0 Axes>

# Joint distribution of boosts against final placement.
# jointplot builds its own figure (sized by `height`), so a preceding
# plt.figure() call would only produce an extra, empty figure.
sns.jointplot(x="winPlacePerc", y="boosts", data=train, ratio=3, height=10, color="blue")
plt.show()

<Figure size 1200x640 with 0 Axes>

# Split matches into game modes using the number of groups as a proxy:
# more than 50 groups -> solos, 26-50 -> duos, 25 or fewer -> squads.
solos = train[train["numGroups"] > 50]
duos = train[(train["numGroups"] > 25) & (train["numGroups"] <= 50)]
squads = train[train["numGroups"] <= 25]

total_players = len(train)
print("{} players ({:.2f}%) play solos,{} players ({:.2f}%) play duos, {} players ({:.2f}%) play squads".format(
    len(solos), 100 * len(solos) / total_players,
    len(duos), 100 * len(duos) / total_players,
    len(squads), 100 * len(squads) / total_players,
))

709111 players (15.95%) play solos,3295326 players (74.10%) play duos, 442529 players (9.95%) play squads

# Compare the kills -> placement relationship across solos, duos and squads.
# One subplots() call is enough; an extra plt.figure() beforehand only
# leaves behind an empty figure.
f, ax1 = plt.subplots(figsize=(15, 8))
sns.pointplot(x="kills", y="winPlacePerc", data=solos, color="black", alpha=0.8, ax=ax1)
sns.pointplot(x="kills", y="winPlacePerc", data=duos, color="blue", alpha=0.8, ax=ax1)
sns.pointplot(x="kills", y="winPlacePerc", data=squads, color="yellow", alpha=0.8, ax=ax1)
ax1.set_title("winPlacePerc in solos vs duos vs squads", fontsize=15)
# Hand-placed colour key instead of a legend.
ax1.text(47, 0.6, "solos", color="black", fontsize=12)
ax1.text(47, 0.55, "duos", color="blue", fontsize=12)
ax1.text(47, 0.5, "squads", color="yellow", fontsize=12)
# Label through the Axes: `plt.ylabel = "..."` would merely rebind the
# pyplot function to a string and leave the plot unlabelled.
ax1.set_ylabel("Win Percentage")
ax1.set_xlabel("Number of kills")
ax1.grid()
plt.show()

<Figure size 1200x640 with 0 Axes>

# Correlation heatmap over all numeric columns.
f, ax = plt.subplots(figsize=(15, 15), dpi=80)
# Id, groupId, matchId and matchType are object (string) columns; select the
# numeric columns explicitly so corr() works regardless of the pandas version.
numeric_corr = train.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True, linewidths=.5, fmt=".1f", ax=ax)
# show must be CALLED; a bare `plt.show` just evaluates to the function object.
plt.show()

<function matplotlib.pyplot.show(*args, **kw)>

# Heatmap of the k columns most positively correlated with winPlacePerc
# (the target itself is one of them, with correlation 1).
k = 5
f, ax = plt.subplots(figsize=(15, 15))
# Restrict to numeric columns so corr() does not trip over the string
# columns (Id, groupId, matchId, matchType) on recent pandas versions.
cols = train.select_dtypes(include="number").corr().nlargest(k, "winPlacePerc")["winPlacePerc"].index
cm = np.corrcoef(train[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(
    cm,
    cbar=True,
    annot=True,
    square=True,
    fmt=".2f",
    annot_kws={"size": 10},
    yticklabels=cols.values,
    xticklabels=cols.values,
)
plt.show()

# Reset seaborn styling to its defaults, then keep the target plus five
# hand-picked feature columns for the pair plot.
sns.set()
cols = ["winPlacePerc", "walkDistance", "boosts", "weaponsAcquired", "damageDealt", "killPlace"]
data_new = train[cols]

data_new

	winPlacePerc	walkDistance	boosts	weaponsAcquired	damageDealt	killPlace
0	0.4444	244.80	0	1	0.000	60
1	0.6400	1434.00	0	5	91.470	57
2	0.7755	161.80	0	2	68.000	47
3	0.1667	202.70	0	3	32.900	75
4	0.1875	49.75	0	2	100.000	45
5	0.0370	34.70	0	1	100.000	44
6	0.0000	13.50	0	1	0.000	96
7	0.7368	1089.00	0	6	8.538	48
8	0.3704	799.90	0	4	51.600	64
9	0.2143	65.67	0	1	37.270	74
10	0.3929	868.30	0	9	28.380	75
11	0.4043	451.70	0	1	137.900	64
12	0.9286	2784.00	0	6	0.000	37
13	0.8750	2050.00	1	6	324.200	5
14	0.9000	1666.00	1	5	122.800	25
15	0.2766	105.10	0	5	80.710	72
16	0.7308	3674.00	2	7	81.710	25
17	0.8211	1787.00	3	3	254.300	13
18	0.1923	137.40	0	2	0.000	79
19	0.9310	3310.00	1	3	65.280	48
20	0.6383	1794.00	4	5	269.100	18
21	0.2143	580.10	0	2	158.700	75
22	0.7500	1264.00	1	4	192.300	15
23	0.9592	2727.00	6	7	1011.000	2
24	0.9231	3503.00	3	4	327.600	3
25	0.8696	2711.00	4	7	558.600	11
26	0.1154	15.30	0	0	44.280	78
27	0.7234	1933.00	4	4	381.200	7
28	0.9630	3855.00	6	4	345.600	6
29	0.0000	0.00	0	0	0.000	87
...	...	...	...	...	...	...
4446936	0.7308	2708.00	1	7	68.200	22
4446937	0.7111	1364.00	2	5	127.400	31
4446938	0.0385	173.10	0	1	0.000	64
4446939	0.3830	56.14	0	1	151.500	35
4446940	0.6250	1362.00	1	6	0.000	58
4446941	0.1600	65.21	0	2	62.350	79
4446942	1.0000	2162.00	4	8	724.700	1
4446943	0.1111	57.59	0	2	0.000	82
4446944	0.6875	1349.00	0	5	175.000	29
4446945	0.1875	57.19	0	1	0.000	81
4446946	0.7292	2591.00	0	7	0.000	53
4446947	0.3830	631.10	0	4	0.000	61
4446948	0.7917	1685.00	3	3	736.500	7
4446949	0.1458	424.60	0	3	100.000	32
4446950	0.5000	1559.00	0	5	203.500	32
4446951	0.1000	44.90	0	1	0.000	85
4446952	0.8462	1177.00	0	5	0.000	44
4446953	0.5926	1025.00	0	5	30.100	57
4446954	0.5306	2146.00	0	6	30.100	58
4446955	0.4792	1158.00	0	3	0.000	60
4446956	0.1071	828.30	0	7	151.900	77
4446957	0.4583	363.70	1	2	100.000	32
4446958	0.0000	0.00	0	0	0.000	92
4446959	0.0842	40.25	0	1	22.680	89
4446960	0.2414	845.60	0	3	327.700	4
4446961	0.1786	1019.00	0	3	0.000	74
4446962	0.2935	81.70	1	6	44.150	69
4446963	0.4815	788.70	0	4	59.060	66
4446964	0.8000	2748.00	4	8	180.400	11
4446965	0.5464	1244.00	2	5	268.000	18

4446966 rows × 6 columns

# Pair-plot only the first 10,000 rows to keep rendering tractable.
data_new = data_new.iloc[:10000]
sns.pairplot(data_new, height=2.5)
plt.show()

机器学习之数据处理

猜你喜欢