First, the scenario described

1. The disk holds too many files that I cannot bear to delete; I only want to clean out the duplicates, but doing that by hand is too laborious.

2, download the finished product:

Baidu Cloud: https://pan.baidu.com/s/1W3pHU-dGi_mrd8M140Vogg
extraction code: ji0r

3. Usage:

(1) Place repeat.exe in the folder whose files you want to scan;

(2) Double click repeat.exe ;

(3) The program automatically traverses the current folder and all of its sub-folders, reads the "fingerprint" of each file one by one and records it in /RESULT/md5.his ; the paths of files that share an identical "fingerprint" are written together on one line in /RESULT/record.log ;

(4) The most original file is kept in place, while the other "copies" are moved to the /RESULT/REPEATS/ folder, leaving the user to decide whether or not to delete them;

(5) The traversal processes the largest files first, so the program may appear to stall ("lag") early in the run;

(6) If large files make the traversal take too long, you can simply close the program; on the next run, files that have already been processed are not processed again, which improves efficiency and avoids wasted effort. Note: Do not delete the file /RESULT/md5.his .

Before running:

Observe that /root/son.jpg and /root/sub/son.jpg are duplicates of each other

/root/grand.jpg and /root/child/Sun/grand.jpg are duplicates of each other

/root/root.jpg has no duplicate

(For ease of explanation and distinction, the copies stored in the other folders have not been renamed.)

Runtime:

After running:

The results show that /root/sub/son.jpg, the copy of /root/son.jpg, and

/root/child/Sun/grand.jpg, the copy of /root/grand.jpg, have been moved to the /root/RESULT/REPEATS/ folder

Users can choose whether to delete the copies of these files

Second, demand analysis

Duplicates of the same file may exist, and their file names may be identical or different; we want to keep the original file (the one with the earliest creation time whose name is not a "copy" or "xx (2)" variant).

How do we determine whether two files are the same file? What can serve as a "fingerprint" that accurately identifies a file's content?

With too many files, or with single files that are too large, a traversal takes too much time; how can a second traversal reuse the results of the previous one?

How do we keep missing permissions on certain files from affecting the overall traversal?

How do we clean up the cache garbage left behind by running the program?

Third, code implementation

1、folder.py

# !/usr/bin/python3
# coding: utf-8
import os

import tool


def deep_list(path):
    """Recursively collect the paths of all regular files below *path*.

    Returns an empty list when *path* is not a directory or when listing it
    raises ``PermissionError`` (the offending path is printed). Entries are
    visited in ``os.listdir`` order; a sub-directory's files are inserted at
    the position where that sub-directory was encountered. Every returned
    path is built with ``tool.join``.
    """
    if not os.path.isdir(path):
        return []

    try:
        entries = os.listdir(path)
    except PermissionError:
        # An unreadable directory must not abort the whole traversal.
        print("PermissionError:", path)
        return []

    found = []
    for entry in entries:
        full = tool.join(path, entry)
        if os.path.isfile(full):
            found.append(full)
            continue
        if os.path.isdir(full):
            found += deep_list(full)
    return found

2、file.py

# !/usr/bin/python3
# coding: utf-8
import hashlib
import os
import traceback


def md5(path):
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is read in fixed-size chunks so large files are never loaded
    into memory at once. On success the path and digest are printed and the
    digest is returned. ``None`` is returned when *path* is not a regular
    file or when reading it fails (the traceback is printed in that case).
    """
    if not os.path.isfile(path):
        return None

    chunk_size = 1024 * 1024  # 1 MiB per read keeps the number of reads low
    hashes = hashlib.md5()
    try:
        # ``with`` guarantees the handle is closed even if a read fails.
        with open(path, "rb") as f:
            while True:
                b = f.read(chunk_size)
                if not b:
                    break
                hashes.update(b)
    except OSError:
        # Unreadable file (permissions, locked, vanished): report and skip.
        traceback.print_exc()
        return None

    md = hashes.hexdigest()
    print("%s : %s" % (path, md))
    return md


def name_order(path):
    """Build the sort key used to decide which duplicate counts as the original.

    The key is the string ``"<suffix>_<earliest timestamp>_<name>_<depth>"``
    where

    * suffix is the lower-cased file extension (including the dot),
    * earliest timestamp is the smallest of the ctime/atime/mtime values in
      nanoseconds,
    * name is the lower-cased base name without the extension, with
      full-width parentheses normalised to ASCII ones,
    * depth is the number of backslash-separated components in the path.

    Returns ``None`` when *path* does not exist.
    """
    if not os.path.exists(path):
        return None

    # Stat the path exactly as given: a lower-cased copy may not exist on a
    # case-sensitive file system.
    info = os.stat(path)
    create = min(info.st_ctime_ns, info.st_atime_ns, info.st_mtime_ns)

    path = str(path).lower().strip()
    # splitext removes only the trailing extension, so a name such as
    # "a.jpg.jpg" keeps its inner ".jpg".
    name, suf = os.path.splitext(os.path.basename(path))
    name = name.strip()
    name = name.replace("（", "(")
    name = name.replace("）", ")")

    layer = len(path.split("\\"))

    return "_".join((suf, str(create), name, str(layer)))

3、tool.py

# !/usr/bin/python3
# coding: utf-8


def join(path, *paths):
    """Join *path* and every item of *paths* with a backslash.

    Each component is normalised with ``fmt`` first, and the joined result
    is normalised once more so that no duplicate separators remain.
    """
    pieces = [fmt(path)]
    pieces.extend(fmt(part) for part in paths)
    return fmt("\\".join(pieces))


def fmt(path):
    """Normalise *path* to single-backslash separators.

    ``None`` becomes the empty string. Otherwise surrounding whitespace is
    stripped, every forward slash is turned into a backslash, and runs of
    backslashes are collapsed into one.
    """
    if path is None:
        return ""

    cleaned = path.strip().replace("/", "\\")
    # One replace pass only halves a run, so repeat until no pair is left.
    while "\\\\" in cleaned:
        cleaned = cleaned.replace("\\\\", "\\")
    return cleaned

4, mei.py

# !/usr/bin/python3
# coding: utf-8
import os
import re
import sys

import tool


def IS_MEI(basename):
    """Return True when *basename* looks like ``_MEI`` followed by digits.

    The pattern is a raw string so that ``\\d`` reaches the regex engine
    instead of being treated as an (invalid) string escape sequence.
    """
    return re.match(r"^_MEI\d+$", basename) is not None


def remove():
    """Delete the first ``_MEI<digits>`` directory found on ``sys.path``.

    Each ``sys.path`` entry whose base name satisfies ``IS_MEI`` is
    considered; an entry without a drive prefix is resolved against the
    current working directory. The first such entry that is an existing
    directory is removed recursively and the search stops. Deletion is
    best-effort: errors while removing are ignored.
    """
    import shutil  # local import: only this function needs it

    for path in sys.path:
        basename = os.path.basename(path)
        if not IS_MEI(basename):
            continue

        drive = os.path.splitdrive(path)[0]
        if "" == drive:
            path = tool.join(os.getcwd(), path)

        if os.path.isdir(path):
            print("remove", path)
            # os.remove() cannot delete a directory; rmtree removes the
            # folder and its contents, skipping anything it cannot delete.
            shutil.rmtree(path, ignore_errors=True)
            break

5, repeat.py (core)

# !/usr/bin/python3
# coding: utf-8
import gc
import os
import sys
import time
import traceback

# Directory that contains this script.
this = os.path.abspath(os.path.dirname(__file__))
# Parent of that directory; appended to sys.path before the imports below run.
module = os.path.split(this)[0]
sys.path.append(module)
# Print the effective import search path, numbered from 1.
for i, val in enumerate(sys.path):
    print("[%s] %s" % (i + 1, val))

import file
import folder
import tool
import mei

# Separator between the file path and its md5 on each line of the md5 files.
SEGMENTER = ">>"

# Output folder created under the scanned directory, and its sub-folder that
# receives the duplicate files.
FOLDER_FOR_RESULT = "RESULT"
FOLDER_FOR_REPEATS = "REPEATS"

# File names used inside the RESULT folder.
FILE_MD5_HIS = "md5.his"  # md5 history reused by later runs
FILE_MD5_TMP = "md5.tmp"  # written during a run, renamed to md5.his at the end
FILE_RECORD_LOG = "record.log"  # log of the duplicate groups found


def read_md5_his(cwd):
    """Load the previously computed md5 values for files under *cwd*.

    Rows of the form ``<path>>><md5>`` are read from ``RESULT/md5.his`` and
    then from ``RESULT/md5.tmp``. A leftover tmp file means the previous run
    was killed before its results were promoted to the history file, so its
    rows are newer and override the history.

    Rows are skipped when they lack the separator, when the digest is not a
    full 32-character md5 (for example a line truncated by a killed run), or
    when the file no longer exists.

    Returns a dict mapping file path to md5 hex digest; empty when neither
    file exists.
    """
    md5_hex_length = 32  # length of hashlib.md5().hexdigest()

    fpMd5 = dict()
    for name in (FILE_MD5_HIS, FILE_MD5_TMP):
        source = tool.join(cwd, FOLDER_FOR_RESULT, name)
        if not os.path.exists(source):
            continue

        # errors='replace' keeps a partially written multi-byte character
        # from aborting the load; such a path fails the exists() check below.
        with open(source, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                row = line.split(SEGMENTER)
                if len(row) < 2:
                    # Blank or malformed line: nothing to recover from it.
                    continue
                fp = row[0].strip()
                md5 = row[1].strip()
                if len(md5) == md5_hex_length and os.path.exists(fp):
                    fpMd5[fp] = md5
    return fpMd5


def reverse_file_md5(fileMd5):
    """Invert a ``{path: md5}`` mapping into ``{md5: set of paths}``.

    The keys of the returned dict are inserted in ascending md5 order.
    """
    by_digest = sorted(fileMd5.items(), key=lambda pair: pair[1])

    md5Fp = dict()
    for file_path, digest in by_digest:
        md5Fp.setdefault(digest, set()).add(file_path)
    return md5Fp


def _append_md5_rows(tmp_path, rows):
    """Append the ``path>>md5`` rows of the dict *rows* to *tmp_path*, then empty it."""
    if not rows:
        return
    text = "".join(p + SEGMENTER + md + "\n" for p, md in rows.items())
    with open(tmp_path, 'a', encoding='utf-8') as f:
        f.write(text)
    rows.clear()


def scan_folder(cwd):
    """Find duplicate files under *cwd* and move the copies to RESULT/REPEATS.

    Steps:
      0. list every file below *cwd*;
      1. drop the running executable and everything inside the RESULT folder;
      2. load the md5 history and write it to the tmp file;
      3. create the RESULT/REPEATS folder;
      4. hash every file not in the history, largest first, appending the
         new digests to the tmp file in batches of ten;
      5. for each digest shared by several files keep the first file in
         ``file.name_order`` order and move the others into RESULT/REPEATS
         (an existing file of the same name there is replaced);
      6. append the duplicate groups, with a timestamp, to record.log.

    Returns ``None``. Prints progress and a message when there is nothing
    to do.
    """
    # step0: listdir
    fps = folder.deep_list(cwd)
    if len(fps) < 2:  # must have it self
        print("No FILE IN", cwd)
        return

    # step1: remove self and root_folder
    self = os.path.abspath(sys.executable)
    print("SELF IS", self)
    if self in fps:
        fps.remove(self)

    result_path = tool.join(cwd, FOLDER_FOR_RESULT)
    # Compare against "RESULT\" so that siblings such as "RESULT2\..." or
    # "RESULTS.txt" are not excluded by a bare prefix match.
    result_prefix = result_path + "\\"
    fps = [fp for fp in fps
           if fp != result_path and not str(fp).startswith(result_prefix)]

    if len(fps) < 1:
        print("No FILE IN", cwd)
        return

    # step2: read history and write to tmp
    tmp_path = tool.join(cwd, FOLDER_FOR_RESULT, FILE_MD5_TMP)
    fpMd5 = read_md5_his(cwd)
    if len(fpMd5) > 0:
        rows = "".join(p + SEGMENTER + md5 + "\n" for p, md5 in fpMd5.items())
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(rows)

    print()

    # step3: makedir
    repeats_path = tool.join(cwd, FOLDER_FOR_RESULT, FOLDER_FOR_REPEATS)
    if not os.path.exists(repeats_path):
        os.makedirs(repeats_path)

    # Largest files first.
    fps.sort(key=lambda fp: os.stat(fp).st_size, reverse=True)

    # step4: read file's md5 and append to tmp
    tmpMd5 = dict()
    for fp in fps:
        if fp in fpMd5:
            continue

        md5 = file.md5(fp)
        if md5 is not None:
            tmpMd5[fp] = md5
            fpMd5[fp] = md5

        if len(tmpMd5) >= 10:
            _append_md5_rows(tmp_path, tmpMd5)
    # Flush the last partial batch unconditionally; tying the flush to the
    # last loop index loses it whenever the last file is already cached.
    _append_md5_rows(tmp_path, tmpMd5)

    print()

    # step5: remove the repeat
    records = []
    md5Fp = reverse_file_md5(fpMd5)
    for md5, group in md5Fp.items():
        if len(group) < 2:
            continue

        # name_order returns None for a file that has vanished; fall back to
        # "" so the sort never compares None with str.
        group = sorted(group, key=lambda fp: file.name_order(fp) or "")

        joined = ','.join(group)
        print("%s : %s" % (md5, joined))
        records.append(joined + "\n")

        # The first entry is kept in place; every other one is moved.
        for old in group[1:]:
            fn = os.path.basename(old)
            new = tool.join(repeats_path, fn)
            if old == new:
                continue

            # if the new is exist
            if os.path.exists(new):
                try:
                    os.remove(new)
                except OSError:
                    traceback.print_exc()

            # move the repeats to REPEAT/FILES folder
            try:
                os.rename(old, new)
            except OSError:
                traceback.print_exc()

    # step6: record the repeat
    content = "".join(records)
    if "" == content:
        print("No REPEAT FILE EXISTS")
    else:
        this_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

        record = tool.join(cwd, FOLDER_FOR_RESULT, FILE_RECORD_LOG)
        with open(record, 'a', encoding='utf-8') as f:
            f.write("\n\n" + this_time + "\n" + content)


def save_file_md5(cwd):
    """Promote ``RESULT/md5.tmp`` to ``RESULT/md5.his`` if the tmp file exists.

    Best-effort: any failure is printed as a traceback and otherwise ignored
    so that the caller's shutdown sequence continues.
    """
    # step7: update md5 history record file
    try:
        his_path = tool.join(cwd, FOLDER_FOR_RESULT, FILE_MD5_HIS)
        tmp_path = tool.join(cwd, FOLDER_FOR_RESULT, FILE_MD5_TMP)

        if os.path.exists(tmp_path):
            # os.replace overwrites an existing destination in one step, so
            # there is no moment at which the history file is missing.
            os.replace(tmp_path, his_path)
    except Exception:
        traceback.print_exc()


if __name__ == '__main__':
    # Script entry point: scan the current working directory, then persist
    # the md5 history, wait for the user and clean up the _MEI temp folder.
    cwd = None  # bound before ``try`` so the ``finally`` block can test it
    try:
        cwd = os.getcwd()
        print("\nCURRENT PATH IS %s\n" % cwd)

        scan_folder(cwd)
    except BaseException:
        # Deliberately broad (includes Ctrl+C): whatever interrupts the scan,
        # report it and still run the shutdown steps below.
        traceback.print_exc()
    finally:
        if cwd is not None:
            save_file_md5(cwd)

        gc.collect()
        input("\nPRESS ANY KEYS TO EXIT\n")
        mei.remove()

Fourth, packaged exe

pyinstaller -F repeat.py

This section does not go into detail; for details, refer to:

"Pyinstaller packaged lessons learned"

"Pyinstaller packaged exe full temp"

Python implementation of the clean up duplicate files