37.大数据处理技巧

思路:将int64转换为int32,将float64转换为float32,将大量重复的object转换为category

演示数据下载地址:https://download.csdn.net/download/xzy53719/10679389

import pandas as pd
import numpy as np
g1 = pd.read_csv('game_logs.csv')
g1.head()
#查看数据数量,行、列
g1.shape
(171907, 161)
#数据占用内存情况
g1.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171907 entries, 0 to 171906
Columns: 161 entries, date to acquisition_info
dtypes: float64(77), int64(6), object(78)
memory usage: 860.5 MB
#每种类型,占用内存情况
for dtype in ['float64','int64','object']:
    selected_dtype = g1.select_dtypes(include=[dtype])
    mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
    mean_usage_mb = mean_usage_b / 1024**2
    print('平均内存占用:',dtype,mean_usage_mb)
平均内存占用: float64 1.2947326073279748
平均内存占用: int64 1.1241934640066964
平均内存占用: object 9.514454069016855
#不同的int类型,最大值与最小值。
int_types=['uint8','int8','int16','int32','int64']
for it in int_types:
    print(np.iinfo(it))
Machine parameters for uint8
---------------------------------------------------------------
min = 0
max = 255
---------------------------------------------------------------

Machine parameters for int8
---------------------------------------------------------------
min = -128
max = 127
---------------------------------------------------------------

Machine parameters for int16
---------------------------------------------------------------
min = -32768
max = 32767
---------------------------------------------------------------

Machine parameters for int32
---------------------------------------------------------------
min = -2147483648
max = 2147483647
---------------------------------------------------------------

Machine parameters for int64
---------------------------------------------------------------
min = -9223372036854775808
max = 9223372036854775807
---------------------------------------------------------------
#计算内存占用量
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else:#Series类型
        usage_b = pandas_obj.memory_usage(deep=True)
    return '{:03.2f} MB'.format(usage_b/1024**2)

#减少内存占用
#思路:int64 -> int32
#取出g1中int64的列
g1_int = g1.select_dtypes(include=['int64'])
converted_int = g1_int.apply(pd.to_numeric,downcast='unsigned')
print('int原大小:',mem_usage(g1_int))
print('int转换后:',mem_usage(converted_int))
#减少内存占用,将float64 -> float32
g1_float = g1.select_dtypes(include=['float64'])
converted_float = g1_float.apply(pd.to_numeric,downcast='float')
print('float原大小:',mem_usage(g1_float))
print('float转换后:',mem_usage(converted_float))
int原大小: 7.87 MB
int转换后: 1.48 MB
float原大小: 100.99 MB
float转换后: 50.49 MB
#将数据重复较多的,转换为category类型
converted_obj = pd.DataFrame()
for col in g1_obj.columns:
    num_unique_values = len(g1_obj[col].unique())
    num_total_values = len(g1_obj[col])
    #若不重复的数据数量,占总数的50%以下,则说明重复数据较多,适合转换为category
    if num_unique_values/num_total_values < 0.5:
        converted_obj.loc[:,col] = g1_obj[col].astype('category')
    else:
        converted_obj.loc[:,col] = g1_obj[col]
print("原大小:",mem_usage(g1_obj))
print("转换前:",mem_usage(converted_obj))
原大小: 751.64 MB
转换前: 51.67 MB
#转换前后数据内存占用对比
optimized_g1 = g1.copy()
optimized_g1[converted_int.columns] = converted_int
optimized_g1[converted_float.columns] = converted_float
optimized_g1[converted_obj.columns] = converted_obj

print("转换前:",mem_usage(g1))
print("转换后:",mem_usage(optimized_g1))
转换前: 860.50 MB
转换后: 103.64 MB
#时间类型,若转换,可能造成内存占用更多
date = optimized_g1.date
print(date[:5])
#查看内存占用
print(mem_usage(optimized_g1['date']))
optimized_g1['date'] = pd.to_datetime(date,format='%Y%m%d')
print(mem_usage(optimized_g1['date']))
0    18710504
1    18710505
2    18710506
3    18710508
4    18710509
Name: date, dtype: uint32
0.66 MB
1.31 MB

猜你喜欢

转载自blog.csdn.net/xzy53719/article/details/82796057