当多个观察单位被存储于同一张表时进行清理

# Load the movie_altered dataset and preview its first five rows.
movie = pd.read_csv(filepath_or_buffer='data/movie_altered.csv')
movie.head(n=5)
title rating year duration director_1 director_fb_likes_1 actor_1 actor_2 actor_3 actor_fb_likes_1 actor_fb_likes_2 actor_fb_likes_3
0 Avatar PG-13 2009.0 178.0 James Cameron 0.0 CCH Pounder Joel David Moore Wes Studi 1000.0 936.0 855.0
1 Pirates of the Caribbean: At World's End PG-13 2007.0 169.0 Gore Verbinski 563.0 Johnny Depp Orlando Bloom Jack Davenport 40000.0 5000.0 1000.0
2 Spectre PG-13 2015.0 148.0 Sam Mendes 0.0 Christoph Waltz Rory Kinnear Stephanie Sigman 11000.0 393.0 161.0
3 The Dark Knight Rises PG-13 2012.0 164.0 Christopher Nolan 22000.0 Tom Hardy Christian Bale Joseph Gordon-Levitt 27000.0 23000.0 23000.0
4 Star Wars: Episode VII - The Force Awakens NaN NaN NaN Doug Walker 131.0 Doug Walker Rob Walker NaN 131.0 12.0 NaN
# Prepend an integer `id` column (0 .. number of rows - 1) that identifies
# each movie.
movie.insert(loc=0, column='id', value=np.arange(movie.shape[0]))
movie.head(n=5)
  id title rating year duration director_1 director_fb_likes_1 actor_1 actor_2 actor_3 actor_fb_likes_1 actor_fb_likes_2 actor_fb_likes_3
0 0 Avatar PG-13 2009.0 178.0 James Cameron 0.0 CCH Pounder Joel David Moore Wes Studi 1000.0 936.0 855.0
1 1 Pirates of the Caribbean: At World's End PG-13 2007.0 169.0 Gore Verbinski 563.0 Johnny Depp Orlando Bloom Jack Davenport 40000.0 5000.0 1000.0
2 2 Spectre PG-13 2015.0 148.0 Sam Mendes 0.0 Christoph Waltz Rory Kinnear Stephanie Sigman 11000.0 393.0 161.0
3 3 The Dark Knight Rises PG-13 2012.0 164.0 Christopher Nolan 22000.0 Tom Hardy Christian Bale Joseph Gordon-Levitt 27000.0 23000.0 23000.0
4 4 Star Wars: Episode VII - The Force Awakens NaN NaN NaN Doug Walker 131.0 Doug Walker Rob Walker NaN 131.0 12.0 NaN
# Reshape with wide_to_long so that all actors end up in one column and all
# Facebook-like counts in another; the numeric column suffix becomes `num`.
stubnames = ['director', 'director_fb_likes', 'actor', 'actor_fb_likes']
movie_long = (
    pd.wide_to_long(movie, stubnames, i='id', j='num', sep='_')
    .reset_index()           # turn the (id, num) index back into columns
    .astype({'num': int})    # make sure `num` is stored as an integer
)
movie_long.head(n=9)
  id num title duration year rating director director_fb_likes actor actor_fb_likes
0 0 1 Avatar 178.0 2009.0 PG-13 James Cameron 0.0 CCH Pounder 1000.0
1 0 2 Avatar 178.0 2009.0 PG-13 NaN NaN Joel David Moore 936.0
2 0 3 Avatar 178.0 2009.0 PG-13 NaN NaN Wes Studi 855.0
3 1 1 Pirates of the Caribbean: At World's End 169.0 2007.0 PG-13 Gore Verbinski 563.0 Johnny Depp 40000.0
4 1 2 Pirates of the Caribbean: At World's End 169.0 2007.0 PG-13 NaN NaN Orlando Bloom 5000.0
5 1 3 Pirates of the Caribbean: At World's End 169.0 2007.0 PG-13 NaN NaN Jack Davenport 1000.0
6 2 1 Spectre 148.0 2015.0 PG-13 Sam Mendes 0.0 Christoph Waltz 11000.0
7 2 2 Spectre 148.0 2015.0 PG-13 NaN NaN Rory Kinnear 393.0
8 2 3 Spectre 148.0 2015.0 PG-13 NaN NaN Stephanie Sigman 161.0
# Split the long frame into three smaller tables: movies, directors, actors.
movie_columns = ['id', 'title', 'year', 'duration', 'rating']
director_columns = ['id', 'director', 'num', 'director_fb_likes']
actor_columns = ['id', 'actor', 'num', 'actor_fb_likes']

movie_table = movie_long[movie_columns]
director_table = movie_long[director_columns]
actor_table = movie_long[actor_columns]

# At this point movie_table still repeats each movie once per `num` value.
movie_table.head(n=9)
  id title year duration rating
0 0 Avatar 2009.0 178.0 PG-13
1 0 Avatar 2009.0 178.0 PG-13
2 0 Avatar 2009.0 178.0 PG-13
3 1 Pirates of the Caribbean: At World's End 2007.0 169.0 PG-13
4 1 Pirates of the Caribbean: At World's End 2007.0 169.0 PG-13
5 1 Pirates of the Caribbean: At World's End 2007.0 169.0 PG-13
6 2 Spectre 2015.0 148.0 PG-13
7 2 Spectre 2015.0 148.0 PG-13
8 2 Spectre 2015.0 148.0 PG-13
director_table.head(9)
  id director num director_fb_likes
0 0 James Cameron 1 0.0
1 0 NaN 2 NaN
2 0 NaN 3 NaN
3 1 Gore Verbinski 1 563.0
4 1 NaN 2 NaN
5 1 NaN 3 NaN
6 2 Sam Mendes 1 0.0
7 2 NaN 2 NaN
8 2 NaN 3 NaN
actor_table.head(9)
  id actor num actor_fb_likes
0 0 CCH Pounder 1 1000.0
1 0 Joel David Moore 2 936.0
2 0 Wes Studi 3 855.0
3 1 Johnny Depp 1 40000.0
4 1 Orlando Bloom 2 5000.0
5 1 Jack Davenport 3 1000.0
6 2 Christoph Waltz 1 11000.0
7 2 Rory Kinnear 2 393.0
8 2 Stephanie Sigman 3 161.0
# Clean the three tables: remove the repeated movie rows, and remove the
# director/actor rows that contain missing values. Each table then gets a
# fresh 0..n-1 index.
movie_table = movie_table.drop_duplicates()
movie_table = movie_table.reset_index(drop=True)

director_table = director_table.dropna()
director_table = director_table.reset_index(drop=True)

actor_table = actor_table.dropna()
actor_table = actor_table.reset_index(drop=True)

movie_table.head(n=5)
  id title year duration rating
0 0 Avatar 2009.0 178.0 PG-13
1 1 Pirates of the Caribbean: At World's End 2007.0 169.0 PG-13
2 2 Spectre 2015.0 148.0 PG-13
3 3 The Dark Knight Rises 2012.0 164.0 PG-13
4 4 Star Wars: Episode VII - The Force Awakens NaN NaN NaN
director_table.head()
  id director num director_fb_likes
0 0 James Cameron 1 0.0
1 1 Gore Verbinski 1 563.0
2 2 Sam Mendes 1 0.0
3 3 Christopher Nolan 1 22000.0
4 4 Doug Walker 1 131.0
# Compare memory usage. With deep=True pandas inspects object columns and
# reports the memory they actually consume.
movie.memory_usage(deep=True).sum()
# 2289818
(
    movie_table.memory_usage(deep=True).sum()
    + director_table.memory_usage(deep=True).sum()
    + actor_table.memory_usage(deep=True).sum()
)
# 2538166
# Create integer id columns for directors and actors from the category codes
# of their names, and place each id column right after `id`.
director_cat = pd.Categorical(director_table['director'])
actor_cat = pd.Categorical(actor_table['actor'])

director_table.insert(loc=1, column='director_id', value=director_cat.codes)
actor_table.insert(loc=1, column='actor_id', value=actor_cat.codes)

director_table.head(n=5)
  id director_id director num director_fb_likes
0 0 922 James Cameron 1 0.0
1 1 794 Gore Verbinski 1 563.0
2 2 2020 Sam Mendes 1 0.0
3 3 373 Christopher Nolan 1 22000.0
4 4 600 Doug Walker 1 131.0
actor_table.head()
  id actor_id actor num actor_fb_likes
0 0 824 CCH Pounder 1 1000.0
1 0 2867 Joel David Moore 2 936.0
2 0 6099 Wes Studi 3 855.0
3 1 2971 Johnny Depp 1 40000.0
4 1 4536 Orlando Bloom 2 5000.0
# Derive the intermediate tables, starting with the directors:
#   director_unique      - one row per distinct (id, name, likes) combination
#   director_associative - links each movie id to a director_id and `num`
dcols = ['director_id', 'director', 'director_fb_likes']
director_unique = director_table[dcols].drop_duplicates()
director_unique = director_unique.reset_index(drop=True)
director_associative = director_table[['id', 'director_id', 'num']]
director_associative.head(n=5)
  id director_id num
0 0 922 1
1 1 794 1
2 2 2020 1
3 3 373 1
4 4 600 1
director_unique.head()
  director_id director director_fb_likes
0 922 James Cameron 0.0
1 794 Gore Verbinski 563.0
2 2020 Sam Mendes 0.0
3 373 Christopher Nolan 22000.0
4 600 Doug Walker 131.0
# Now do the same for the actors:
#   actor_unique      - one row per distinct (id, name, likes) combination
#   actor_associative - links each movie id to an actor_id and `num`
acols = ['actor_id', 'actor', 'actor_fb_likes']
actor_unique = actor_table[acols].drop_duplicates()
actor_unique = actor_unique.reset_index(drop=True)
actor_associative = actor_table[['id', 'actor_id', 'num']]
actor_associative.head(n=5)
  id actor_id num
0 0 824 1
1 0 2867 2
2 0 6099 3
3 1 2971 1
4 1 4536 2
actor_unique.head()
  actor_id actor actor_fb_likes
0 824 CCH Pounder 1000.0
1 2867 Joel David Moore 936.0
2 6099 Wes Studi 855.0
3 2971 Johnny Depp 40000.0
4 4536 Orlando Bloom 5000.0
# Total memory used by the five normalized tables.
sum(
    table.memory_usage(deep=True).sum()
    for table in (
        movie_table,
        director_associative,
        director_unique,
        actor_associative,
        actor_unique,
    )
)
# 1746766
movie_table.head()
  id title year duration rating
0 0 Avatar 2009.0 178.0 PG-13
1 1 Pirates of the Caribbean: At World's End 2007.0 169.0 PG-13
2 2 Spectre 2015.0 148.0 PG-13
3 3 The Dark Knight Rises 2012.0 164.0 PG-13
4 4 Star Wars: Episode VII - The Force Awakens NaN NaN NaN
# The original movie layout can be rebuilt from the normalized tables: join
# each associative table with its lookup table, pivot on `num`, then flatten
# the two-level column index into prefixed names such as `actor_1`.
#
# The id column is dropped with `columns=` because passing the axis
# positionally (`.drop('actor_id', 1)`) was deprecated in pandas 1.3 and
# raises a TypeError in pandas 2.0 and later.
actors = (
    actor_associative
    .merge(actor_unique, on='actor_id')
    .drop(columns='actor_id')
    .pivot_table(index='id', columns='num', aggfunc='first')
)

# Level 0 holds the value name, level 1 the `num` value: join them with '_'.
actors.columns = (
    actors.columns.get_level_values(0)
    + '_'
    + actors.columns.get_level_values(1).astype(str)
)

directors = (
    director_associative
    .merge(director_unique, on='director_id')
    .drop(columns='director_id')
    .pivot_table(index='id', columns='num', aggfunc='first')
)

directors.columns = (
    directors.columns.get_level_values(0)
    + '_'
    + directors.columns.get_level_values(1).astype(str)
)
actors.head()
  actor_1 actor_2 actor_3 actor_fb_likes_1 actor_fb_likes_2 actor_fb_likes_3
id            
0 CCH Pounder Joel David Moore Wes Studi 1000.0 936.0 855.0
1 Johnny Depp Orlando Bloom Jack Davenport 40000.0 5000.0 1000.0
2 Christoph Waltz Rory Kinnear Stephanie Sigman 11000.0 393.0 161.0
3 Tom Hardy Christian Bale Joseph Gordon-Levitt 27000.0 23000.0 23000.0
4 Doug Walker Rob Walker NaN 131.0 12.0 NaN
directors.head()
  director_1 director_fb_likes_1
id    
0 James Cameron 0.0
1 Gore Verbinski 563.0
2 Sam Mendes 0.0
3 Christopher Nolan 22000.0
4 Doug Walker 131.0
# Left-join the pivoted director and actor columns back onto movie_table by
# `id`, so every movie is kept.
movie2 = pd.merge(movie_table, directors.reset_index(), on='id', how='left')
movie2 = pd.merge(movie2, actors.reset_index(), on='id', how='left')
movie2.head(n=5)
  id title year duration rating director_1 director_fb_likes_1 actor_1 actor_2 actor_3 actor_fb_likes_1 actor_fb_likes_2 actor_fb_likes_3
0 0 Avatar 2009.0 178.0 PG-13 James Cameron 0.0 CCH Pounder Joel David Moore Wes Studi 1000.0 936.0 855.0
1 1 Pirates of the Caribbean: At World's End 2007.0 169.0 PG-13 Gore Verbinski 563.0 Johnny Depp Orlando Bloom Jack Davenport 40000.0 5000.0 1000.0
2 2 Spectre 2015.0 148.0 PG-13 Sam Mendes 0.0 Christoph Waltz Rory Kinnear Stephanie Sigman 11000.0 393.0 161.0
3 3 The Dark Knight Rises 2012.0 164.0 PG-13 Christopher Nolan 22000.0 Tom Hardy Christian Bale Joseph Gordon-Levitt 27000.0 23000.0 23000.0
4 4 Star Wars: Episode VII - The Force Awakens NaN NaN NaN Doug Walker 131.0 Doug Walker Rob Walker NaN 131.0 12.0 NaN

猜你喜欢

转载自blog.csdn.net/weixin_48135624/article/details/114196687