análisis de los datos
import warnings
warnings. filterwarnings( 'ignore' )
import missingno as msno
import pandas as pd
import matplotlib. pyplot as plt
import seaborn as sns
import numpy as np
# Load both CSV splits; the paths are resolved relative to the working directory.
train_data, test_data = map(pd.read_csv, ('train.csv', 'testA.csv'))
Todas las características del conjunto de datos han sido anonimizadas
# Preview the first and last five training rows in one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat yields the same result on every pandas version.
pd.concat([train_data.head(), train_data.tail()])
id
heartbeat_signals
label
0
0
0.9912297987616655,0.9435330436439665,0.764677 ...
0.0
1
1
0.9714822034884503,0.9289687459588268,0.572932 ...
0.0
2
2
1.0,0.9591487564065292,0.7013782792997189,0.23 ...
2.0
3
3
0.9757952826275774,0.9340884687738161,0.659636 ...
0.0
4
4
0.0,0.055816398940721094,0.26129357194994196,0 ...
2.0
99995
99995
1.0,0.677705342021188,0.22239242747868546,0.25 ...
0.0
99996
99996
0.9268571578157265,0.9063471198026871,0.636993 ...
2.0
99997
99997
0.9258351628306013,0.5873839035878395,0.633226 ...
3.0
99998
99998
1.0,0.9947621698382489,0.8297017704865509,0.45 ...
2.0
99999
99999
0.9259994004527861,0.916476635326053,0.4042900 ...
0.0
# (rows, columns) of the training frame.
train_data.shape
(100000, 3)
# Preview the first and last five test rows in one frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat yields the same result on every pandas version.
pd.concat([test_data.head(), test_data.tail()])
id
heartbeat_signals
0
100000
0.9915713654170097,1.0,0.6318163407681274,0.13 ...
1
100001
0.6075533139615096,0.5417083883163654,0.340694 ...
2
100002
0.9752726292239277,0.6710965234906665,0.686758 ...
3
100003
0.9956348033996116,0.9170249621481004,0.521096 ...
4
100004
1.0,0.8879490481178918,0.745564725322326,0.531 ...
19995
119995
1.0,0.8330283177934747,0.6340472606311671,0.63 ...
19996
119996
1.0,0.8259705825857048,0.4521053488322387,0.08 ...
19997
119997
0.951744840752379,0.9162611283848351,0.6675251 ...
19998
119998
0.9276692903808186,0.6771898159607004,0.242906 ...
19999
119999
0.6653212231837624,0.527064114047737,0.5166625 ...
# (rows, columns) of the test frame.
test_data.shape
(20000, 2)
Resumen general de los datos
# Summary statistics (count, mean, std, min, quartiles, max) of the training frame.
train_data.describe()
id
label
count
100000.000000
100000.000000
mean
49999.500000
0.856960
std
28867.657797
1.217084
min
0.000000
0.000000
25%
24999.750000
0.000000
50%
49999.500000
0.000000
75%
74999.250000
2.000000
max
99999.000000
3.000000
# NOTE(review): `info` is referenced without being called, so this expression
# evaluates to the bound method and the cell shows its repr (which embeds the
# DataFrame repr). `train_data.info()` would print the dtype / non-null
# summary instead; the code is left as-is to match the recorded output below.
train_data. info
<bound method DataFrame.info of id heartbeat_signals label
0 0 0.9912297987616655,0.9435330436439665,0.764677... 0.0
1 1 0.9714822034884503,0.9289687459588268,0.572932... 0.0
2 2 1.0,0.9591487564065292,0.7013782792997189,0.23... 2.0
3 3 0.9757952826275774,0.9340884687738161,0.659636... 0.0
4 4 0.0,0.055816398940721094,0.26129357194994196,0... 2.0
... ... ... ...
99995 99995 1.0,0.677705342021188,0.22239242747868546,0.25... 0.0
99996 99996 0.9268571578157265,0.9063471198026871,0.636993... 2.0
99997 99997 0.9258351628306013,0.5873839035878395,0.633226... 3.0
99998 99998 1.0,0.9947621698382489,0.8297017704865509,0.45... 2.0
99999 99999 0.9259994004527861,0.916476635326053,0.4042900... 0.0
[100000 rows x 3 columns]>
# Summary statistics (count, mean, std, min, quartiles, max) of the test frame.
test_data.describe()
id
count
20000.000000
mean
109999.500000
std
5773.647028
min
100000.000000
25%
104999.750000
50%
109999.500000
75%
114999.250000
max
119999.000000
# NOTE(review): `info` is referenced without being called, so this expression
# evaluates to the bound method and the cell shows its repr (which embeds the
# DataFrame repr). `test_data.info()` would print the dtype / non-null
# summary instead; the code is left as-is to match the recorded output below.
test_data. info
<bound method DataFrame.info of id heartbeat_signals
0 100000 0.9915713654170097,1.0,0.6318163407681274,0.13...
1 100001 0.6075533139615096,0.5417083883163654,0.340694...
2 100002 0.9752726292239277,0.6710965234906665,0.686758...
3 100003 0.9956348033996116,0.9170249621481004,0.521096...
4 100004 1.0,0.8879490481178918,0.745564725322326,0.531...
... ... ...
19995 119995 1.0,0.8330283177934747,0.6340472606311671,0.63...
19996 119996 1.0,0.8259705825857048,0.4521053488322387,0.08...
19997 119997 0.951744840752379,0.9162611283848351,0.6675251...
19998 119998 0.9276692903808186,0.6771898159607004,0.242906...
19999 119999 0.6653212231837624,0.527064114047737,0.5166625...
[20000 rows x 2 columns]>
Determinar datos faltantes y anormales
`data.isnull().sum()`: muestra cuántos valores NaN hay en cada columna
# Count of missing values per column in the training frame.
train_data.isnull().sum()
id 0
heartbeat_signals 0
label 0
dtype: int64
# Count of missing values per column in the test frame.
test_data.isnull().sum()
id 0
heartbeat_signals 0
dtype: int64
Comprender la distribución de la variable objetivo (label)
# Display the label column of the training frame.
train_data['label']
0 0.0
1 0.0
2 2.0
3 0.0
4 2.0
...
99995 0.0
99996 2.0
99997 3.0
99998 2.0
99999 0.0
Name: label, Length: 100000, dtype: float64
# Number of training rows per label value, most frequent first.
train_data['label'].value_counts()
0.0 64327
3.0 17912
2.0 14199
1.0 3562
Name: label, dtype: int64
import scipy.stats as st

y = train_data['label']

# One row of three panels: KDE with rug marks, then histograms with a fitted
# normal and a fitted log-normal curve overlaid.
panel_specs = (
    (131, {'rug': True, 'bins': 20}),
    (132, {'kde': False, 'fit': st.norm}),
    (133, {'kde': False, 'fit': st.lognorm}),
)
for subplot_code, distplot_options in panel_specs:
    plt.subplot(subplot_code)
    sns.distplot(y, **distplot_options)
plt.show()

# Stand-alone distribution plot of the label, followed by its shape statistics.
sns.distplot(y)
label_skewness = y.skew()
label_kurtosis = y.kurt()
print("Skewness: %f" % label_skewness)
print("Kurtosis: %f" % label_kurtosis)
Skewness: 0.871005
Kurtosis: -1.009573
# Skewness and kurtosis of the numeric columns (id, label).
# heartbeat_signals holds comma-separated strings; since pandas 2.0
# numeric_only defaults to False, so the bare calls raise TypeError on that
# column. Passing numeric_only=True keeps the result identical on older
# versions and working on newer ones.
train_data.skew(numeric_only=True), train_data.kurt(numeric_only=True)
(id 0.000000
label 0.871005
dtype: float64,
id -1.200000
label -1.009573
dtype: float64)
# Distribution plot of the per-column kurtosis values.
# numeric_only=True is required on pandas >= 2.0, where the default no longer
# drops the string column heartbeat_signals and kurt() raises TypeError.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; kept here so
# the axlabel argument and the resulting figure stay unchanged.
sns.distplot(train_data.kurt(numeric_only=True), color='orange', axlabel='Kurtness')
<AxesSubplot:xlabel='Kurtness', ylabel='Density'>
# Red vertical bar histogram of the training labels.
plt.hist(
    train_data['label'],
    orientation='vertical',
    histtype='bar',
    color='red',
)
plt.show()