超大文件排序(即外部归并排序),主要步骤为:将大文件切割成若干小文件、各小文件分别排序、再归并为一个有序的大文件。
一、用 pandas 切割超大文件:
import time
import pandas as pd
from tqdm import tqdm
def reader_pandas(file, sep='\t', chunkSize=5000000, patitions=21, header=None):
    """Split a huge delimited file into numbered chunk files.

    Reads *file* iteratively and writes each chunk of up to *chunkSize*
    rows to 'sorted<i>.csv' (i starting at 1), with no header row in the
    output.

    file:      path of the large input file
    sep:       field separator used when parsing the input
    chunkSize: number of rows per output chunk file
    patitions: expected number of chunks (only sizes the progress bar)
    header:    forwarded to pd.read_csv (None -> input has no header row)
    """
    # BUG FIX: the original ignored `sep` and `header` and parsed the file
    # with pandas defaults (',' separator, first row treated as a header);
    # forward them so the parameters actually take effect.
    reader = pd.read_csv(file, sep=sep, header=header, iterator=True)
    with tqdm(range(patitions), 'Reading ...') as t:
        # enumerate replaces the original hand-maintained counter
        # (and the unused module-level `i = 0`).
        for i, _ in enumerate(t, start=1):
            try:
                chunk = reader.get_chunk(chunkSize)
            except StopIteration:
                # Fewer chunks than `patitions`: input exhausted.
                break
            chunk.to_csv('sorted' + str(i) + '.csv', index=False, header=None)
    # Alternative: collect the chunks and pd.concat them to load the whole
    # file into memory faster than a single read_csv call.

reader_pandas("file.csv")
二、用 pandas 对各小文件按关键字排序:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time
# Sort each chunk file by columns 4, 5, 6, 1 (in that precedence order),
# drop duplicate rows, and write the sorted chunk to sort21/sort<i>.csv.
for i in range(1, 2):
    start_time = time.time()
    # Raw string: the original relied on '\o' not being an escape sequence;
    # the bytes are identical, the intent is now explicit.
    chunk_path = r'datasets\operation&static' + str(i) + '.csv'
    try:
        lc = pd.read_csv(chunk_path, sep='\t', header=None)
        print('chunk' + str(i))
    except Exception as e:
        # BUG FIX: chain the original error instead of discarding it, so the
        # real cause (bad path, parse error, ...) remains visible.
        raise Exception('file is not found!') from e
    # Columns are addressed by integer position because the chunks were
    # written without a header row.
    lc = lc.sort_values(by=[4, 5, 6, 1], ascending=[True, True, True, True])
    lc = lc.drop_duplicates()
    lc.to_csv('sort21/sort' + str(i) + '.csv', index=False, header=None, sep='\t')
    end_time = time.time()
    print(end_time - start_time)
三、归并为一个大的有序文件:
import time
import pandas as pd
from tqdm import tqdm
import os
def getKey(l):
    """Return the merge key for one tab-separated line.

    The key is columns 4, 5, 6 and 1 (in that precedence order),
    each parsed as an int.
    """
    fields = l.split("\t")
    return tuple(int(fields[col]) for col in (4, 5, 6, 1))
def mergeFiles(F):
    """K-way merge of the pre-sorted chunk files in directory *F*.

    Every file in *F* must already be sorted by getKey (columns 4, 5, 6, 1).
    The merged, globally sorted output is written to 'sort2.csv'.
    """
    # BUG FIX: the original ignored the result of os.path.exists(F) and then
    # opened 'sort21/' + f regardless of F; open the files actually listed.
    files = os.listdir(F)
    fs = []
    lines = []
    keys = []
    try:
        for f in files:
            fh = open(os.path.join(F, f))
            first = fh.readline()
            if not first:
                # BUG FIX: an empty chunk file used to crash in getKey('').
                fh.close()
                continue
            fs.append(fh)
            lines.append(first)
            keys.append(getKey(first))
        buff = []
        buffSize = 2          # flush the output buffer after this many lines
        append = buff.append
        get_key = getKey      # local alias: faster lookup in the hot loop
        with open('sort2.csv', 'w') as output:
            if keys:          # BUG FIX: min([]) used to crash on an empty dir
                key = min(keys)
                index = keys.index(key)
                while fs:
                    # Drain the current smallest-key file for as long as it
                    # keeps holding the minimum key.
                    while key == min(keys):
                        append(lines[index])
                        if len(buff) > buffSize:
                            output.write(''.join(buff))
                            del buff[:]
                        line = fs[index].readline()
                        if not line:
                            # This chunk is exhausted: drop it from the merge.
                            fs[index].close()
                            del fs[index]
                            del keys[index]
                            del lines[index]
                            break
                        key = get_key(line)
                        keys[index] = key
                        lines[index] = line
                    if not fs:
                        break
                    # Current key is no longer the minimum: switch to the
                    # file that now holds the smallest key.
                    key = min(keys)
                    index = keys.index(key)
            if buff:
                output.write(''.join(buff))
    finally:
        # BUG FIX: input handles were leaked if anything raised mid-merge.
        for fh in fs:
            fh.close()
# Time the merge of the chunk files in the 'files' directory.
merge_start = time.time()
mergeFiles('files')
print(time.time() - merge_start)