37.大数据处理技巧

思路：将int64转换为int32，将float64转换为float32，将大量重复的object转换为category

演示数据下载地址：https://download.csdn.net/download/xzy53719/10679389

import pandas as pd
import numpy as np
g1 = pd.read_csv('game_logs.csv')
g1.head()

#查看数据数量，行、列
g1.shape

(171907, 161)

#数据占用内存情况
g1.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171907 entries, 0 to 171906
Columns: 161 entries, date to acquisition_info
dtypes: float64(77), int64(6), object(78)
memory usage: 860.5 MB

#每种类型，占用内存情况
for dtype in ['float64','int64','object']:
    selected_dtype = g1.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024**2
    print('平均内存占用：',dtype,mean_usage_mb)

平均内存占用： float64 1.2947326073279748
平均内存占用： int64 1.1241934640066964
平均内存占用： object 9.514454069016855

#不同的int类型，最大值与最小值。
int_types=['uint8','int8','int16','int32','int64']
for it in int_types:
    print(np.iinfo(it))

Machine parameters for uint8
---------------------------------------------------------------
min = 0
max = 255
---------------------------------------------------------------

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------

#计算内存占用量
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else:#Series类型
        usage_b = pandas_obj.memory_usage(deep=True)
    return '{:03.2f} MB'.format(usage_b/1024**2)

#减少内存占用
#思路：int64 -> int32
#取出g1中int64的列
g1_int = g1.select_dtypes(include=['int64'])
converted_int = g1_int.apply(pd.to_numeric,downcast='unsigned')
print('int原大小：',mem_usage(g1_int))
print('int转换后：',mem_usage(converted_int))
#减少内存占用，将float64 -> float32
g1_float = g1.select_dtypes(include=['float64'])
converted_float = g1_float.apply(pd.to_numeric,downcast='float')
print('float原大小：',mem_usage(g1_float))
print('float转换后：',mem_usage(converted_float))

int原大小： 7.87 MB
int转换后： 1.48 MB
float原大小： 100.99 MB
float转换后： 50.49 MB

#将数据重复较多的，转换为category类型
converted_obj = pd.DataFrame()
for col in g1_obj.columns:
    num_unique_values = len(g1_obj[col].unique())
    num_total_values = len(g1_obj[col])
    #若不重复的数据数量，占总数的50%以下，则说明重复数据较多，适合转换为category
    if num_unique_values/num_total_values < 0.5:
        converted_obj.loc[:,col] = g1_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = g1_obj[col]
print("原大小：",mem_usage(g1_obj))
print("转换前：",mem_usage(converted_obj))

原大小： 751.64 MB
转换前： 51.67 MB

#转换前后数据内存占用对比
optimized_g1 = g1.copy()
optimized_g1[converted_int.columns] = converted_int
optimized_g1[converted_float.columns] = converted_float
optimized_g1[converted_obj.columns] = converted_obj

print("转换前：",mem_usage(g1))
print("转换后：",mem_usage(optimized_g1))

转换前： 860.50 MB
转换后： 103.64 MB

#时间类型，若转换，可能造成内存占用更多
date = optimized_g1.date
print(date[:5])
#查看内存占用
print(mem_usage(optimized_g1['date']))
optimized_g1['date'] = pd.to_datetime(date,format='%Y%m%d')
print(mem_usage(optimized_g1['date']))

0    18710504
1    18710505
2    18710506
3    18710508
4    18710509
Name: date, dtype: uint32
0.66 MB
1.31 MB

37.大数据处理技巧

猜你喜欢