Python 比较两个文件的相同记录,并将相同记录写入文件中

  • 方法1
#比较两个文件相同记录,并将相同记录写入文件中
import time
def write_common_lines(first_path, second_path, result_path):
    """Write every record of ``second_path`` that also occurs in ``first_path``.

    Records are compared after stripping trailing whitespace. The lines
    themselves are used as set members, so two different records can never
    be mistaken for each other (a hand-rolled numeric hash used as a lookup
    key can collide, e.g. "aa" and "b@" under a base-33 polynomial hash).

    Args:
        first_path: Path of the reference file.
        second_path: Path of the file whose records are checked.
        result_path: Path of the output file; it is overwritten.

    Returns:
        The number of records written to ``result_path``. A record that
        appears several times in ``second_path`` is written each time.
    """
    with open(first_path) as first_file:
        known_records = {line.rstrip() for line in first_file}

    written = 0
    # The second input is opened before the output, so a missing input file
    # does not leave an empty result file behind.
    with open(second_path) as second_file, open(result_path, 'w') as result_file:
        for line in second_file:
            record = line.rstrip()
            if record in known_records:
                result_file.write(record + '\n')
                written += 1
    return written


if __name__ == '__main__':
    time1 = time.time()
    print(time1)
    write_common_lines('sample_submission.csv', 'sample_submission1.csv', 'result.csv')
    time2 = time.time()
    print(time2)
    print('比较两文件消耗时间为:' + str(time2-time1))

两文件记录在11万左右,相同记录7万左右
执行结果:
1527750785.3048437
1527750790.127326
比较两文件消耗时间为:4.822482347488403
1527750811.9845114
1527750816.800993
比较两文件消耗时间为:4.816481590270996
1527750834.5217652
1527750839.1572285
比较两文件消耗时间为:4.635463237762451

  • 方法2
#比较两个文件相同记录,并将相同记录写入文件中
import time
# Capture and print the start timestamp; it is subtracted from the end
# timestamp at the bottom of the script to report the elapsed time.
time1 = time.time()
print(time1)

def bin_qry(search_line, max_index, file_line):
    """Binary-search the sorted sequence ``file_line`` for ``search_line``.

    Args:
        search_line: The value to look for.
        max_index: Number of leading elements of ``file_line`` to search
            (normally ``len(file_line)``).
        file_line: Sequence sorted in ascending order.

    Returns:
        0 if ``search_line`` is found among the first ``max_index``
        elements, otherwise -1.
    """
    import bisect

    # Clamp the range so an oversized max_index cannot index past the end;
    # a non-positive bound means there is nothing to search.
    upper = min(max_index, len(file_line))
    if upper <= 0:
        return -1

    # bisect_left returns the leftmost insertion point, so the value is
    # present exactly when the element at that position equals it.
    pos = bisect.bisect_left(file_line, search_line, 0, upper)
    if pos < upper and file_line[pos] == search_line:
        return 0
    return -1


# Load the first file, strip trailing whitespace from each record and sort
# the records so they can be binary-searched.
with open('sample_submission.csv') as first_file:
    file_line = sorted(record.rstrip() for record in first_file.readlines())
    sum_line = len(file_line)

    # Copy every record of the second file that is also present in the
    # first file into the result file.
    with open('sample_submission1.csv') as second_file, open('result.csv', 'w') as result_file:
        for candidate in second_file.readlines():
            candidate = candidate.rstrip()
            found = bin_qry(candidate, sum_line, file_line) == 0
            if found:
                result_file.write(candidate + '\n')

# Report the end timestamp and the elapsed time in seconds.
time2 = time.time()
print(time2)
print(time2 - time1)

执行结果:
1527750971.1994314
1527750974.4837599
3.2843284606933594

1527750990.768388
1527750993.9247038
3.156315803527832

1527751010.618373
1527751013.8136923
3.195319414138794

猜你喜欢

转载自blog.csdn.net/codehouse/article/details/80524810
今日推荐