import pandas as pd
import numpy as np
# A Series of strings in which one entry is missing (NaN).
string_data = pd.Series(['aardvark', 'artwdfv', np.nan, 'asdfaa'])
string_data
0 aardvark
1 artwdfv
2 NaN
3 asdfaa
dtype: object
# Boolean mask: True at the positions holding a missing value.
string_data. isnull( )
0 False
1 False
2 True
3 False
dtype: bool
# Python's None is also treated as missing by isnull(), just like NaN.
string_data[ 0 ] = None
string_data. isnull( )
0 True
1 False
2 True
3 False
dtype: bool
from numpy import nan as NA
# Numeric Series with two missing entries (NA is an alias for numpy.nan).
data = pd.Series([1, NA, 3, 4, NA, 7])
# dropna() returns a new Series holding only the non-missing values.
data.dropna()
0 1.0
2 3.0
3 4.0
5 7.0
dtype: float64
# Equivalent to data.dropna(): boolean indexing with the "not missing" mask.
data[ data. notnull( ) ]
0 1.0
2 3.0
3 4.0
5 7.0
dtype: float64
# Small frame with NaNs scattered over the rows; row 2 is entirely missing.
# The float literals are written as 1.0 / 3.0: the spaced form "1 ." is not
# valid Python.
data = pd.DataFrame([[1.0, 6.5, 3.0], [1.0, NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.0]])
# With no arguments, dropna() drops every row containing at least one NaN.
cleaned = data.dropna()
data
0
1
2
0
1.0
6.5
3.0
1
1.0
NaN
NaN
2
NaN
NaN
NaN
3
NaN
6.5
3.0
cleaned
# how='all' only drops rows in which every value is NaN (row 2 here).
data. dropna( how= 'all' )
0
1
2
0
1.0
6.5
3.0
1
1.0
NaN
NaN
3
NaN
6.5
3.0
# Add a new column labelled 4 that is missing in every row.
data[ 4 ] = NA
data
0
1
2
4
0
1.0
6.5
3.0
NaN
1
1.0
NaN
NaN
NaN
2
NaN
NaN
NaN
NaN
3
NaN
6.5
3.0
NaN
# axis=1 applies the test to columns: drop columns that are entirely NaN.
data. dropna( axis= 1 , how= 'all' )
0
1
2
0
1.0
6.5
3.0
1
1.0
NaN
NaN
2
NaN
NaN
NaN
3
NaN
6.5
3.0
# 7x3 frame of standard-normal draws with NaNs injected by position:
# column 1 is missing in its first four rows, column 2 in its first two.
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df
0
1
2
0
0.468787
NaN
NaN
1
0.903261
NaN
NaN
2
1.453601
NaN
1.693059
3
1.053961
NaN
-0.147527
4
0.405867
1.042093
-1.693640
5
-0.416778
-0.802466
2.841372
6
0.348987
-1.585632
0.061224
# Keep only the rows that have no missing values at all.
df. dropna( )
0
1
2
4
0.405867
1.042093
-1.693640
5
-0.416778
-0.802466
2.841372
6
0.348987
-1.585632
0.061224
# thresh=2 keeps rows having at least two non-missing values.
df. dropna( thresh= 2 )
0
1
2
2
1.453601
NaN
1.693059
3
1.053961
NaN
-0.147527
4
0.405867
1.042093
-1.693640
5
-0.416778
-0.802466
2.841372
6
0.348987
-1.585632
0.061224
# Return a copy of df in which every missing value is replaced by 0.
df. fillna( 0 )
0
1
2
0
0.468787
0.000000
0.000000
1
0.903261
0.000000
0.000000
2
1.453601
0.000000
1.693059
3
1.053961
0.000000
-0.147527
4
0.405867
1.042093
-1.693640
5
-0.416778
-0.802466
2.841372
6
0.348987
-1.585632
0.061224
# A dict gives a different fill value per column: 0 for column 1, 0.5 for column 2.
df. fillna( { 1 : 0 , 2 : 0.5 } )
0
1
2
0
0.468787
0.000000
0.500000
1
0.903261
0.000000
0.500000
2
1.453601
0.000000
1.693059
3
1.053961
0.000000
-0.147527
4
0.405867
1.042093
-1.693640
5
-0.416778
-0.802466
2.841372
6
0.348987
-1.585632
0.061224
# inplace=True makes fillna modify df itself instead of returning a new frame;
# the result of the call is bound to the throwaway name `_`.
_ = df. fillna( 0 , inplace= True )
df
0
1
2
0
0.468787
0.000000
0.000000
1
0.903261
0.000000
0.000000
2
1.453601
0.000000
1.693059
3
1.053961
0.000000
-0.147527
4
0.405867
1.042093
-1.693640
5
-0.416778
-0.802466
2.841372
6
0.348987
-1.585632
0.061224
# 6x3 frame of standard-normal draws whose NaNs sit at the tail of the columns:
# column 1 is missing from row 2 onwards, column 2 from row 4 onwards.
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df
0
1
2
0
1.813182
2.118317
0.654455
1
0.404148
0.387881
-0.082305
2
0.841433
NaN
-0.922404
3
-0.569958
NaN
1.136830
4
1.007093
NaN
NaN
5
1.725698
NaN
NaN
# Forward-fill: each missing value takes the last valid value above it in the
# same column. ffill() is the direct spelling of fillna(method='ffill'); the
# `method` keyword of fillna is deprecated in recent pandas releases.
df.ffill()
0
1
2
0
1.813182
2.118317
0.654455
1
0.404148
0.387881
-0.082305
2
0.841433
0.387881
-0.922404
3
-0.569958
0.387881
1.136830
4
1.007093
0.387881
1.136830
5
1.725698
0.387881
1.136830
# Frame whose last row (index 6) repeats the row before it, used to
# demonstrate duplicate detection.
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data
k1
k2
0
one
1
1
two
1
2
one
2
3
two
3
4
one
3
5
two
4
6
two
4
# Boolean Series: True for each row that repeats an earlier row.
data. duplicated( )
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
# Return the frame with the repeated rows removed.
data. drop_duplicates( )
k1
k2
0
one
1
1
two
1
2
one
2
3
two
3
4
one
3
5
two
4
# Add a column of distinct values, then de-duplicate on 'k1' only:
# the first row seen for each 'k1' value is kept.
data[ 'v1' ] = range ( 7 )
data. drop_duplicates( [ 'k1' ] )
k1
k2
v1
0
one
1
0
1
two
1
1
# De-duplicate on the ('k1', 'k2') pair; keep='last' retains the final
# occurrence of each pair instead of the first.
data. drop_duplicates( [ 'k1' , 'k2' ] , keep= 'last' )
k1
k2
v1
0
one
1
0
1
two
1
1
2
one
2
2
3
two
3
3
4
one
3
4
6
two
4
6
# Meats and their weights; note the inconsistent capitalization of some
# food names ('Pastrami', 'Bacon').
data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon',
        'Pastrami', 'corned beef', 'Bacon',
        'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data
food
ounces
0
bacon
4.0
1
pulled pork
3.0
2
bacon
12.0
3
Pastrami
6.0
4
corned beef
7.5
5
Bacon
8.0
6
pastrami
3.0
7
honey ham
5.0
8
nova lox
6.0
# Lookup table from a (lower-case) meat name to the animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}
# Normalize capitalization so names such as 'Bacon' and 'Pastrami' match the
# lower-case keys of meat_to_animal.
lowercased = data[ 'food' ] . str . lower( )
lowercased
0 bacon
1 pulled pork
2 bacon
3 pastrami
4 corned beef
5 bacon
6 pastrami
7 honey ham
8 nova lox
Name: food, dtype: object
# Series.map with a dict looks each value up as a key; the result becomes
# the new 'animal' column.
data[ 'animal' ] = lowercased. map ( meat_to_animal)
data
food
ounces
animal
0
bacon
4.0
pig
1
pulled pork
3.0
pig
2
bacon
12.0
pig
3
Pastrami
6.0
cow
4
corned beef
7.5
cow
5
Bacon
8.0
pig
6
pastrami
3.0
cow
7
honey ham
5.0
pig
8
nova lox
6.0
salmon
# Same mapping in a single step: the function lower-cases each food name and
# then looks it up in meat_to_animal.
data[ 'food' ] . map ( lambda x : meat_to_animal[ x. lower( ) ] )
0 pig
1 pig
2 pig
3 cow
4 cow
5 pig
6 cow
7 pig
8 salmon
Name: food, dtype: object
# Integer Series containing the values -999 and -1000, which the following
# cells replace.
data = pd.Series([1, -999, 2, -999, -1000, 3])
data
0 1
1 -999
2 2
3 -999
4 -1000
5 3
dtype: int64
# Replace every -999 with NaN; a new Series is returned.
data. replace( - 999 , np. nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 -1000.0
5 3.0
dtype: float64
# A list of values to replace: both -999 and -1000 become NaN.
data. replace( [ - 999 , - 1000 ] , np. nan)
0 1.0
1 NaN
2 2.0
3 NaN
4 NaN
5 3.0
dtype: float64
# A dict gives a separate replacement per value: -999 -> NaN, -1000 -> 0.
data. replace( { - 999 : np. nan, - 1000 : 0 } )
0 1.0
1 NaN
2 2.0
3 NaN
4 0.0
5 3.0
dtype: float64
# 3x4 frame of consecutive integers labelled by state name and column name.
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)


def transform(x):
    """Return the first four characters of *x*, upper-cased."""
    return x[:4].upper()


# Index.map applies the function to every label and returns a new Index.
data.index.map(transform)
Index(['OHIO', 'COLO', 'NEW '], dtype='object')
# Assigning to data.index relabels the rows of data itself.
data. index = data. index. map ( transform)
data
one
two
three
four
OHIO
0
1
2
3
COLO
4
5
6
7
NEW
8
9
10
11
# rename() returns a relabelled copy: title-case row labels, upper-case
# column labels. data itself is left unchanged.
data. rename( index= str . title, columns= str . upper)
ONE
TWO
THREE
FOUR
Ohio
0
1
2
3
Colo
4
5
6
7
New
8
9
10
11
# Dicts rename only the listed labels; all other labels are kept as they are.
data. rename( index= { 'OHIO' : 'INDIANA' } ,
columns= { 'three' : 'peekaboo' } )
one
two
peekaboo
four
INDIANA
0
1
2
3
COLO
4
5
6
7
NEW
8
9
10
11
# inplace=True applies the rename to data itself instead of returning a copy.
data. rename( index= { 'OHIO' : 'INDIANA' } , inplace= True )
data
one
two
three
four
INDIANA
0
1
2
3
COLO
4
5
6
7
NEW
8
9
10
11
# Ages to be grouped, and the edges of the four bins they are sorted into.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# pd.cut assigns every age to the interval between two consecutive edges.
cats = pd.cut(ages, bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]
# Integer code of the bin each age fell into (a position within cats.categories).
cats. codes
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
# The intervals that define the bins.
cats. categories
IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
closed='right',
dtype='interval[int64]')
# Number of ages falling into each bin. The top-level pd.value_counts()
# function is deprecated in recent pandas; the Series method is the
# recommended replacement.
pd.Series(cats).value_counts()
(18, 25] 5
(35, 60] 3
(25, 35] 3
(60, 100] 1
dtype: int64
# right=False makes each bin closed on the left and open on the right.
pd. cut( ages, [ 18 , 26 , 36 , 61 , 100 ] , right= False )
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]
# One name per bin, in bin order; with labels= the categories carry these
# names instead of interval notation.
group_names = [ 'Youth' , 'YoungAdult' , 'MiddleAged' , 'Senior' ]
pd. cut( ages, bins, labels= group_names)
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]
# Twenty draws from the standard normal distribution.
data = np.random.randn(20)
data
array([ 1.91724059, 0.71063941, -0.61160619, -0.83774853, -0.30427484,
-0.13651668, 0.12231811, 1.02349581, 0.44230242, 2.5811469 ,
0.84007075, -0.40956094, 1.87198738, -1.69861267, -0.52190509,
-0.1944561 , -0.44986769, 0.64421648, 1.96899093, 0.04159415])
# An integer instead of explicit edges asks for that many equal-width bins
# spanning the data; precision=2 limits the digits shown in the bin labels.
pd. cut( data, 4 , precision= 2 )
[(1.51, 2.58], (0.44, 1.51], (-0.63, 0.44], (-1.7, -0.63], (-0.63, 0.44], ..., (-0.63, 0.44], (-0.63, 0.44], (0.44, 1.51], (1.51, 2.58], (-0.63, 0.44]]
Length: 20
Categories (4, interval[float64]): [(-1.7, -0.63] < (-0.63, 0.44] < (0.44, 1.51] < (1.51, 2.58]]
# 1000 standard-normal draws, split at the sample quartiles so that the four
# bins hold the same number of points.
data = np.random.randn(1000)
cats = pd.qcut(data, 4)
cats
[(-0.65, 0.0814], (-0.65, 0.0814], (0.0814, 0.727], (0.0814, 0.727], (-2.875, -0.65], ..., (0.0814, 0.727], (-2.875, -0.65], (-0.65, 0.0814], (-0.65, 0.0814], (-0.65, 0.0814]]
Length: 1000
Categories (4, interval[float64]): [(-2.875, -0.65] < (-0.65, 0.0814] < (0.0814, 0.727] < (0.727, 3.834]]
# Every quartile bin holds the same number of points. The top-level
# pd.value_counts() function is deprecated in recent pandas; the Series
# method is the recommended replacement.
pd.Series(cats).value_counts()
(0.727, 3.834] 250
(0.0814, 0.727] 250
(-0.65, 0.0814] 250
(-2.875, -0.65] 250
dtype: int64
# Custom quantile edges (fractions between 0 and 1, inclusive). The last edge
# is written 1.0: the spaced form "1 ." is not valid Python.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.0])
[(-1.237, 0.0814], (-1.237, 0.0814], (0.0814, 1.324], (0.0814, 1.324], (-2.875, -1.237], ..., (0.0814, 1.324], (-1.237, 0.0814], (-1.237, 0.0814], (-1.237, 0.0814], (-1.237, 0.0814]]
Length: 1000
Categories (4, interval[float64]): [(-2.875, -1.237] < (-1.237, 0.0814] < (0.0814, 1.324] < (1.324, 3.834]]
# 1000x4 frame of standard-normal draws, followed by per-column summary
# statistics.
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()
0
1
2
3
count
1000.000000
1000.000000
1000.000000
1000.000000
mean
-0.088724
0.021011
0.043887
0.006012
std
0.990026
0.982459
0.970484
1.013532
min
-3.417757
-3.501364
-2.653510
-3.266161
25%
-0.722939
-0.618738
-0.637500
-0.723452
50%
-0.070858
0.047673
0.011295
0.017201
75%
0.578929
0.689053
0.735396
0.685065
max
2.695907
3.217885
3.304064
3.158566
# Entries of column 2 whose absolute value exceeds 3.
col = data[ 2 ]
col[ np. abs ( col) > 3 ]
583 3.304064
Name: 2, dtype: float64
# Rows in which at least one column exceeds 3 in absolute value. The axis is
# passed by keyword: pandas 2.x no longer accepts it positionally in any().
data[(np.abs(data) > 3).any(axis=1)]
0
1
2
3
37
-0.327884
2.157466
-0.043636
3.073042
152
-3.417757
-0.061750
-0.935451
-0.627025
175
0.578744
-0.562655
-1.122764
3.140705
232
-3.108754
0.673518
0.165646
0.924763
292
1.270998
3.217885
0.172434
-0.872227
417
0.705947
-0.002233
1.380826
-3.266161
487
-3.008020
-0.298071
-0.048238
0.680068
512
0.165514
-3.501364
-1.157821
0.817954
583
-1.525473
-1.329746
3.304064
-2.202428
813
-0.230513
0.459634
0.130212
3.158566
# 5x4 frame of consecutive integers, and a random reordering of its five row
# positions.
df = pd.DataFrame(np.arange(5 * 4).reshape(5, 4))
sampler = np.random.permutation(5)
sampler
array([2, 3, 0, 1, 4])
df
0
1
2
3
0
0
1
2
3
1
4
5
6
7
2
8
9
10
11
3
12
13
14
15
4
16
17
18
19
# take() returns the rows at the given integer positions, in that order.
df. take( sampler)
0
1
2
3
2
8
9
10
11
3
12
13
14
15
0
0
1
2
3
1
4
5
6
7
4
16
17
18
19
# Draw a random subset of three rows.
df. sample( n= 3 )
0
1
2
3
0
0
1
2
3
3
12
13
14
15
1
4
5
6
7
# Sampling with replacement, so more draws can be taken than there are
# values and entries may repeat.
choices = pd.Series([5, 7, -1, 6, 4])
draws = choices.sample(n=10, replace=True)
draws
0 5
0 5
1 7
3 6
2 -1
1 7
4 4
2 -1
2 -1
1 7
dtype: int64
# A categorical 'key' column with three distinct values, plus a running counter.
df = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'b'],
    'data1': range(6),
})
df
key
data1
0
b
0
1
b
1
2
a
2
3
c
3
4
a
4
5
b
5
# One indicator column per distinct value of df['key'].
pd. get_dummies( df[ 'key' ] )
a
b
c
0
0
1
0
1
0
1
0
2
1
0
0
3
0
0
1
4
1
0
0
5
0
1
0
# prefix='key' names the indicator columns key_a, key_b, key_c; they are then
# joined onto the 'data1' column by index.
dummies = pd. get_dummies( df[ 'key' ] , prefix= 'key' )
df_with_dummy = df[ [ 'data1' ] ] . join( dummies)
df_with_dummy
data1
key_a
key_b
key_c
0
0
0
1
0
1
1
0
1
0
2
2
1
0
0
3
3
0
0
1
4
4
1
0
0
5
5
0
1
0
# Column names for the header-less MovieLens movies file.
mnames = ['movie_id', 'title', 'genres']
# The '::' separator is longer than one character, which only the Python
# parsing engine supports. Requesting it explicitly avoids the ParserWarning
# pandas emits when it has to fall back from the C engine.
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')
movies[:10]
C:\Anaconda\lib\site-packages\ipykernel_launcher.py:3: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
This is separate from the ipykernel package so we can avoid doing imports until
movie_id
title
genres
0
1
Toy Story (1995)
Animation|Children's|Comedy
1
2
Jumanji (1995)
Adventure|Children's|Fantasy
2
3
Grumpier Old Men (1995)
Comedy|Romance
3
4
Waiting to Exhale (1995)
Comedy|Drama
4
5
Father of the Bride Part II (1995)
Comedy
5
6
Heat (1995)
Action|Crime|Thriller
6
7
Sabrina (1995)
Comedy|Romance
7
8
Tom and Huck (1995)
Adventure|Children's
8
9
Sudden Death (1995)
Action
9
10
GoldenEye (1995)
Action|Adventure|Thriller
# Flatten the '|'-separated genre strings of all movies into one list.
all_genres = []
for x in movies.genres:
    all_genres.extend(x.split('|'))
# Distinct genres in order of first appearance. The list is wrapped in an
# object array because passing a plain list to pd.unique is deprecated in
# recent pandas releases.
genres = pd.unique(np.array(all_genres, dtype=object))
genres
array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
'Western'], dtype=object)
# All-zeros matrix with one row per movie and one column per distinct genre.
zero_matrix = np. zeros( ( len ( movies) , len ( genres) ) )
zero_matrix
array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]])
# Wrap the zero matrix in a DataFrame whose columns are labelled by genre.
dummies = pd. DataFrame( zero_matrix, columns= genres)
dummies
Animation
Children's
Comedy
Adventure
Fantasy
Romance
Drama
Action
Crime
Thriller
Horror
Sci-Fi
Documentary
War
Musical
Mystery
Film-Noir
Western
0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
2
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
4
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
5
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
6
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
7
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
8
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
9
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
11
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
12
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
13
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
14
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
15
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
16
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
17
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
18
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
19
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
20
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
21
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
22
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
23
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
24
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
25
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
26
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
27
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
28
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
29
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
3853
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3854
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3855
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3856
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3857
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3858
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3859
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3860
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3861
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3862
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3863
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3864
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3865
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3866
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3867
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3868
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3869
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3870
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3871
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3872
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3873
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3874
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3875
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3876
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3877
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3878
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3879
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3880
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3881
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3882
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3883 rows × 18 columns
# Genre string of the first movie, split into its individual genres.
gen = movies. genres[ 0 ]
gen. split( '|' )
['Animation', "Children's", 'Comedy']
# get_indexer returns the integer column position of each genre label.
dummies. columns. get_indexer( gen. split( '|' ) )
array([0, 1, 2], dtype=int64)
# For every movie, look up the column positions of its genres and set those
# cells of the movie's row to 1.
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1
dummies
Animation
Children's
Comedy
Adventure
Fantasy
Romance
Drama
Action
Crime
Thriller
Horror
Sci-Fi
Documentary
War
Musical
Mystery
Film-Noir
Western
0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1
0.0
1.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
2
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
4
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
5
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
6
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
7
0.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
8
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
9
0.0
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10
0.0
0.0
1.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
11
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
12
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
13
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
14
0.0
0.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
15
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
16
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
17
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
18
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
19
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
20
0.0
0.0
1.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
21
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
22
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
23
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
24
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
25
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
26
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
27
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
28
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
29
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
3853
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
3854
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3855
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3856
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
3857
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
3858
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3859
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3860
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3861
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
3862
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
3863
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
3864
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
3865
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3866
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3867
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
3868
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3869
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3870
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3871
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3872
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3873
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3874
0.0
0.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3875
1.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3876
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3877
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3878
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3879
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3880
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3881
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3882
0.0
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
3883 rows × 18 columns
# Attach the indicator columns to the movie table, prefixing each column name.
# NOTE(review): these columns are genre indicators, so the 'Game_' prefix looks
# like a slip for 'Genre_' — confirm before relying on the column names.
movies_windic = movies. join( dummies. add_prefix( 'Game_' ) )
movies_windic. iloc[ 0 ]
movie_id 1
title Toy Story (1995)
genres Animation|Children's|Comedy
Game_Animation 1
Game_Children's 1
Game_Comedy 1
Game_Adventure 0
Game_Fantasy 0
Game_Romance 0
Game_Drama 0
Game_Action 0
Game_Crime 0
Game_Thriller 0
Game_Horror 0
Game_Sci-Fi 0
Game_Documentary 0
Game_War 0
Game_Musical 0
Game_Mystery 0
Game_Film-Noir 0
Game_Western 0
Name: 0, dtype: object
# Seed NumPy's global generator so the ten uniform draws are reproducible.
np.random.seed(12345)
values = np.random.rand(10)
values
array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])
# Bin the values into five equal-width intervals over [0, 1], then build one
# indicator column per interval.
bins = [ 0 , 0.2 , 0.4 , 0.6 , 0.8 , 1 ]
pd. get_dummies( pd. cut( values, bins) )
(0.0, 0.2]
(0.2, 0.4]
(0.4, 0.6]
(0.6, 0.8]
(0.8, 1.0]
0
0
0
0
0
1
1
0
1
0
0
0
2
1
0
0
0
0
3
0
1
0
0
0
4
0
0
1
0
0
5
0
0
1
0
0
6
0
0
0
0
1
7
0
0
0
1
0
8
0
0
0
1
0
9
0
0
0
1
0