from __future__ import print_function
import fnmatch
import os
import hashlib
import sys
# Bytes read per chunk when hashing files.
# NOTE(review): the name is a typo for CHUNK_SIZE; kept as-is because
# get_chunk() references it — rename both together in a follow-up.
CHUNM_SIZE = 8192
def is_file_match(f, patterns):
    """Return True if path *f* matches at least one glob in *patterns*."""
    return any(fnmatch.fnmatch(f, glob) for glob in patterns)
def find_specific_files(dir, pattern=("*",), exclude_dir=()):
    """Recursively yield full paths under *dir* matching any glob in *pattern*.

    Fixes over the original:
    - Mutable default arguments (lists) replaced with tuples.
    - The exclude logic iterated exclude_dir while removing from it, which
      emptied the exclude list without ever pruning the walk.  Directory
      names listed in *exclude_dir* are now removed from ``dirnames`` in
      place so ``os.walk`` never descends into them.

    :param dir: root directory to search (name kept for caller compatibility,
                though it shadows the builtin).
    :param pattern: iterable of fnmatch globs tested against the full path.
    :param exclude_dir: iterable of directory *names* to skip entirely.
    """
    excluded = set(exclude_dir)
    for root, dirnames, filenames in os.walk(dir):
        # In-place slice assignment is required: os.walk re-reads this
        # list to decide which subdirectories to visit.
        dirnames[:] = [d for d in dirnames if d not in excluded]
        for filename in filenames:
            full_path = os.path.join(root, filename)
            if any(fnmatch.fnmatch(full_path, glob) for glob in pattern):
                yield full_path
def get_chunk(file, chunk_size=None):
    """Yield the contents of *file* as successive binary chunks.

    Generalized: the read size is now a parameter instead of being fixed
    to the module-level constant.  The default is resolved lazily so
    existing ``get_chunk(path)`` callers behave exactly as before.

    :param file: path of the file to read (opened in binary mode).
    :param chunk_size: bytes per read; defaults to CHUNM_SIZE (8192).
    """
    if chunk_size is None:
        chunk_size = CHUNM_SIZE
    with open(file, 'rb') as fb:
        while True:
            chunk = fb.read(chunk_size)
            if not chunk:  # b'' at EOF terminates the generator
                break
            yield chunk
def get_md5_sum(file):
    """Return the hexadecimal MD5 digest of *file*'s contents.

    Reads the file in 8192-byte binary chunks (the module's read size)
    so arbitrarily large files are hashed without loading them whole.
    """
    digest = hashlib.md5()
    with open(file, 'rb') as handle:
        while True:
            block = handle.read(8192)  # 8192 == CHUNM_SIZE
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
def main():
    """Scan the directory named in sys.argv[1] and report files whose
    contents (by MD5 digest) duplicate an earlier file.

    Fixes over the original:
    - Missing argv raised a bare IndexError; now exits with a usage line.
    - User-facing error message grammar ("is not a directories") corrected.
    - Local name ``file`` no longer shadows the builtin.
    """
    if len(sys.argv) < 2:
        raise SystemExit("usage: {0} DIRECTORY".format(sys.argv[0]))
    dir_for_search = sys.argv[1]
    if not os.path.isdir(dir_for_search):
        raise SystemExit("{dir} is not a directory.".format(dir=dir_for_search))
    # digest -> first path seen with that digest
    record = {}
    for path in find_specific_files(dir_for_search):
        md5_sum = get_md5_sum(path)
        if md5_sum in record:
            print("find duplicated file {0} vs {1}".format(record.get(md5_sum), path))
        else:
            record[md5_sum] = path
if __name__ == '__main__':
    # Removed the debug leftover ``sys.argv.append(r'F:\python')`` which
    # hard-coded a Windows path into every run; the directory to scan
    # must now be supplied on the command line.
    main()
# Find all files with identical content under a given directory.
# Source: blog.csdn.net/a200822146085/article/details/89296854
# (CSDN page-footer text that was pasted below the code has been
# converted to this comment so the file parses as valid Python.)