I. Scenario
1. The disk holds too many files. You cannot bring yourself to delete them wholesale and only want to clean out the duplicates, but doing that by hand is too laborious.
2. Download the finished tool:
Baidu Cloud: https://pan.baidu.com/s/1W3pHU-dGi_mrd8M140Vogg
extraction code: ji0r
3. Usage:
(1) Place repeat.exe in the folder you want to scan;
(2) Double-click repeat.exe;
(3) It automatically walks the current folder and all of its sub-folders, reads the "fingerprint" (MD5) of each file one by one and records it in /RESULT/md5.his; the paths of files that share an identical fingerprint are written, one group per line, to /RESULT/record.log;
(4) The most original file of each group is kept, while the other "copies" are moved to the /RESULT/REPEATS/ folder, where the user can decide whether or not to delete them;
(5) After the deep traversal the largest files are processed first, so the program may appear sluggish in the early stage;
(6) If large files make the scan take too long, you can simply close the program; on the next run, files that were already processed are not hashed again, which improves efficiency and avoids wasted effort. Note: do not delete the file /RESULT/md5.his.
Before running:
Note that /root/son.jpg and /root/sub/son.jpg are duplicates of each other,
/root/grand.jpg and /root/sub/sun/grand.jpg are duplicates of each other,
and /root/root.jpg has no duplicate.
(For ease of explanation and identification, the copies stored in the other folders were not renamed.)
While running:
After running:
The result shows that /root/sub/son.jpg, the copy of /root/son.jpg,
and /root/sub/sun/grand.jpg, the copy of /root/grand.jpg, have both been moved to the /root/RESULT/REPEATS/ folder.
The user can then choose whether to delete these copies.
II. Requirements analysis
Duplicates of the same file may exist under the same file name or under different names; we want to keep the original file (the earliest creation time, and a file name that is not "copy" or "xx (2)").
How do we decide whether two files are the same file? What can serve as a "fingerprint" that identifies a file's content accurately?
With too many files, or with single files that are too large, a scan takes too long; how can a second scan reuse the results of the previous one?
How do we keep missing permissions on certain files from affecting the overall traversal?
How do we clean up the cache garbage left behind by running the program?
III. Code implementation
1. folder.py
# !/usr/bin/python3
# coding: utf-8
import os
import tool
def deep_list(path):
    """Recursively collect the paths of all regular files below ``path``.

    Entries are visited in ``os.listdir`` order; the files of a
    sub-directory are inserted at the position where that sub-directory
    was encountered. Each returned path is built with ``tool.join``.

    Returns an empty list when ``path`` is not a directory or when listing
    it raises ``PermissionError``.
    """
    if not os.path.isdir(path):
        return []
    try:
        entries = os.listdir(path)
    except PermissionError:
        # An unreadable directory must not abort the whole traversal.
        print("PermissionError:", path)
        return []
    found = []
    for entry in entries:
        full = tool.join(path, entry)
        if os.path.isfile(full):
            found.append(full)
            continue
        if os.path.isdir(full):
            found += deep_list(full)
    return found
2. file.py
# !/usr/bin/python3
# coding: utf-8
import hashlib
import os
import traceback
def md5(path):
    """Return the hexadecimal MD5 digest of the file at ``path``.

    The file is read in fixed-size chunks so that large files are never
    loaded into memory at once. On success the path and digest are
    printed as progress output.

    Returns ``None`` when ``path`` is not a regular file or when it cannot
    be opened or read (the traceback is printed in that case).
    """
    if not os.path.isfile(path):
        return None
    hashes = hashlib.md5()
    try:
        # ``with`` guarantees the handle is closed even if a read fails.
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1024 * 1024), b""):
                hashes.update(chunk)
    except OSError:
        # Best effort: an unreadable file is skipped, not fatal.
        traceback.print_exc()
        return None
    md = hashes.hexdigest()
    print("%s : %s" % (path, md))
    return md
def name_order(path):
    """Build a string sort key used to rank files with identical content.

    The key has the form ``<suffix>_<timestamp>_<name>_<depth>`` where

    * ``suffix`` is the lower-cased file extension,
    * ``timestamp`` is the smallest of the ctime/atime/mtime values in
      nanoseconds,
    * ``name`` is the lower-cased base name without its extension,
    * ``depth`` is the number of back-slash separated path components.

    Returns ``None`` when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return None
    # Stat the path exactly as given: lower-casing it first makes the
    # lookup fail on case-sensitive file systems.
    info = os.stat(path)
    create = min(info.st_ctime_ns, info.st_atime_ns, info.st_mtime_ns)
    path = str(path).lower().strip()
    basename = os.path.basename(path)
    # Split the extension off the end only; a plain str.replace would also
    # delete the same text wherever else it occurs in the name.
    name, suf = os.path.splitext(basename)
    name = name.strip()
    # NOTE(review): the original replaced "(" with "(" and ")" with ")",
    # which is a no-op; presumably full-width parentheses were meant to be
    # normalised to ASCII so that "xx(2)" and "xx(2)" compare alike.
    name = name.replace("(", "(")
    name = name.replace(")", ")")
    layer = len(path.split("\\"))
    return "_".join((suf, str(create), name, str(layer)))
3. tool.py
# !/usr/bin/python3
# coding: utf-8
def join(path, *paths):
    """Join ``path`` and every element of ``paths`` with back-slashes.

    Each component is normalised with ``fmt`` before it is appended, and
    the running result is normalised again after every append, so the
    returned string never contains forward slashes or doubled back-slashes.
    """
    joined = fmt(path)
    for part in paths:
        joined = fmt(joined + "\\" + fmt(part))
    return joined
def fmt(path):
    """Normalise ``path`` to single back-slash separators.

    Surrounding whitespace is stripped, every forward slash becomes a
    back-slash, and runs of back-slashes are collapsed to one.
    ``None`` is mapped to the empty string.
    """
    if path is None:
        return ""
    cleaned = path.strip().replace("/", "\\")
    # One pass can leave a new doubled pair behind (e.g. three in a row),
    # so keep collapsing until none remain.
    while "\\\\" in cleaned:
        cleaned = cleaned.replace("\\\\", "\\")
    return cleaned
4. mei.py
# !/usr/bin/python3
# coding: utf-8
import os
import re
import sys
import tool
def IS_MEI(basename):
    """Return True when ``basename`` has the form ``_MEI<digits>``.

    That is the naming pattern of the temporary folder a one-file
    PyInstaller executable unpacks itself into.
    """
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    return re.match(r"^_MEI\d+$", basename) is not None
def remove():
    """Delete the first ``_MEI<digits>`` directory found on ``sys.path``.

    Entries of ``sys.path`` whose base name does not match ``IS_MEI`` are
    skipped. A matching entry without a drive prefix is resolved against
    the current working directory. Only the first matching entry that is
    an existing directory is processed; a failure to delete it is reported
    and otherwise ignored.
    """
    # Local import so the module's import block does not have to change.
    import shutil

    for path in sys.path:
        if not IS_MEI(os.path.basename(path)):
            continue
        if not os.path.splitdrive(path)[0]:
            path = tool.join(os.getcwd(), path)
        if os.path.isdir(path):
            print("remove", path)
            try:
                # os.remove() cannot delete a directory; rmtree can.
                shutil.rmtree(path)
            except OSError as err:
                # Files still in use by the running process may remain.
                print("remove failed:", err)
            break
5. repeat.py (core)
# !/usr/bin/python3
# coding: utf-8
import gc
import os
import sys
import time
import traceback
# Append the parent of this script's directory to the import search path,
# then print every search-path entry, numbered from 1.
this = os.path.abspath(os.path.dirname(__file__))
module = os.path.dirname(this)
sys.path.append(module)
for i, val in enumerate(sys.path):
    print("[%s] %s" % (i + 1, val))
import file
import folder
import tool
import mei
# Separator between the file path and its md5 on each history row.
SEGMENTER = ">>"
# Output folder created inside the scanned directory.
FOLDER_FOR_RESULT = "RESULT"
# Sub-folder of the output folder that duplicate files are moved into.
FOLDER_FOR_REPEATS = "REPEATS"
# History file holding the path/md5 rows read at the start of a scan.
FILE_MD5_HIS = "md5.his"
# Working file written during a scan; it replaces the history file at the end.
FILE_MD5_TMP = "md5.tmp"
# Log file that each run appends its groups of duplicate paths to.
FILE_RECORD_LOG = "record.log"
def read_md5_his(cwd):
    """Load the path -> md5 history recorded by a previous scan of ``cwd``.

    Each row of the history file has the form ``<path>>><md5>``. Rows
    without a separator, rows whose path no longer exists, and rows whose
    md5 is shorter than two characters are ignored.

    Returns a dict mapping file path to md5 string; it is empty when the
    history file does not exist.
    """
    his_path = tool.join(cwd, FOLDER_FOR_RESULT, FILE_MD5_HIS)
    if not os.path.exists(his_path):
        return dict()
    fpMd5 = dict()
    with open(his_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Split on the LAST separator so a path that itself contains
            # ">>" is kept intact (the md5 part never contains it).
            fp, sep, md5 = line.rpartition(SEGMENTER)
            if not sep:
                # Blank or truncated row: skip it instead of crashing.
                continue
            fp = fp.strip()
            md5 = md5.strip()
            if os.path.exists(fp) and len(md5) > 1:
                fpMd5[fp] = md5
    return fpMd5
def reverse_file_md5(fileMd5):
    """Invert a path -> md5 mapping into an md5 -> set-of-paths mapping.

    The keys of the returned dict are inserted in ascending md5 order.
    """
    grouped = dict()
    for path, digest in sorted(fileMd5.items(), key=lambda item: item[1]):
        grouped.setdefault(digest, set()).add(path)
    return grouped
def _append_md5_rows(tmp_path, rows):
    """Append one ``<path>>><md5>`` line per entry of ``rows`` to ``tmp_path``."""
    with open(tmp_path, 'a', encoding='utf-8') as f:
        f.write("".join(p + SEGMENTER + md + "\n" for p, md in rows.items()))


def _file_size(fp):
    """Return the size of ``fp`` in bytes, or 0 when it cannot be stat'ed."""
    try:
        return os.stat(fp).st_size
    except OSError:
        return 0


def scan_folder(cwd):
    """Find duplicate files below ``cwd`` and move the extra copies away.

    Steps:

    0. list every file below ``cwd``;
    1. drop the running executable and everything inside the result folder;
    2. create the result folders, load the md5 history and copy it into
       the temporary history file;
    3. hash every file not covered by the history, largest first, appending
       new rows to the temporary history file in batches of ten;
    4. group files by md5; within each group keep the file that sorts
       first by ``file.name_order`` and move the others into the repeats
       folder;
    5. append the groups of duplicates to the record log.

    Returns ``None``. Failures to move a file are printed and skipped.
    """
    # step0: listdir
    fps = folder.deep_list(cwd)
    if len(fps) < 2:  # fewer than two files cannot contain a duplicate
        print("No FILE IN", cwd)
        return

    # step1: remove self and everything inside the result folder
    exe_path = os.path.abspath(sys.executable)
    print("SELF IS", exe_path)
    result_path = tool.join(cwd, FOLDER_FOR_RESULT)
    # Compare against "<result>\" so that siblings such as "RESULT2" or
    # "RESULT.txt" are not excluded by a bare prefix match.
    result_prefix = result_path + "\\"
    fps = [fp for fp in fps
           if fp != exe_path and not str(fp).startswith(result_prefix)]
    if not fps:
        print("No FILE IN", cwd)
        return

    # step2: make sure the output folders exist, then seed tmp with history
    repeats_path = tool.join(cwd, FOLDER_FOR_RESULT, FOLDER_FOR_REPEATS)
    os.makedirs(repeats_path, exist_ok=True)
    tmp_path = tool.join(cwd, FOLDER_FOR_RESULT, FILE_MD5_TMP)
    fpMd5 = read_md5_his(cwd)
    if fpMd5:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write("".join(p + SEGMENTER + md5 + "\n"
                            for p, md5 in fpMd5.items()))
    print()

    # step3: hash the files not in the history, largest first
    fps.sort(key=_file_size, reverse=True)
    pending = dict()
    for fp in fps:
        if fp in fpMd5:
            continue
        md5 = file.md5(fp)
        if md5 is None:
            continue
        pending[fp] = md5
        fpMd5[fp] = md5
        if len(pending) >= 10:
            _append_md5_rows(tmp_path, pending)
            pending.clear()
    # Flush the remainder after the loop: checking for the last index
    # inside the loop loses it whenever the last file is already cached.
    _append_md5_rows(tmp_path, pending)
    print()

    # step4: move the repeats
    record_lines = []
    md5Fp = reverse_file_md5(fpMd5)
    for md5, paths in md5Fp.items():
        # name_order() returns None for a missing file, which cannot be
        # sorted against strings, so ignore paths that have disappeared.
        group = [fp for fp in paths if os.path.exists(fp)]
        if len(group) < 2:
            continue
        group.sort(key=file.name_order)
        print("%s : %s" % (md5, ','.join(group)))
        record_lines.append(','.join(group) + "\n")
        # The first entry is kept in place; the others are moved.
        for old in group[1:]:
            new = tool.join(repeats_path, os.path.basename(old))
            if old == new:
                continue
            try:
                # os.replace overwrites an existing target in one step.
                os.replace(old, new)
            except OSError:
                traceback.print_exc()

    # step5: record the repeats
    content = "".join(record_lines)
    if "" == content:
        print("No REPEAT FILE EXISTS")
    else:
        this_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        record = tool.join(cwd, FOLDER_FOR_RESULT, FILE_RECORD_LOG)
        with open(record, 'a', encoding='utf-8') as f:
            f.write("\n\n" + this_time + "\n" + content)
def save_file_md5(cwd):
    """Promote the temporary md5 file to be the new history file.

    When the temporary file exists it replaces the history file; otherwise
    nothing happens. A failure is printed and does not propagate.
    """
    # step7: update md5 history record file
    his_path = tool.join(cwd, FOLDER_FOR_RESULT, FILE_MD5_HIS)
    tmp_path = tool.join(cwd, FOLDER_FOR_RESULT, FILE_MD5_TMP)
    try:
        if os.path.exists(tmp_path):
            # A single replace avoids the window between "remove old" and
            # "rename new" in which the history could be lost.
            os.replace(tmp_path, his_path)
    except OSError:
        traceback.print_exc()
if __name__ == '__main__':
    # Resolve the working directory before entering ``try`` so that the
    # ``finally`` clause can always refer to it.
    cwd = os.getcwd()
    try:
        print("\nCURRENT PATH IS %s\n" % cwd)
        scan_folder(cwd)
    except BaseException:
        # Deliberately broad: whatever interrupts the scan (including
        # Ctrl+C) is reported, and the clean-up below still runs.
        traceback.print_exc()
    finally:
        # Keep whatever md5 rows were collected, even after a failure.
        save_file_md5(cwd)
        gc.collect()
        input("\nPRESS ANY KEYS TO EXIT\n")
        mei.remove()
IV. Packaging as an exe
pyinstaller -F repeat.py
Packaging is not covered in detail here; for details, refer to: