Python 文件夹内容比较

标签：Python

参考网站

官方文档
第三方文档
安装目录下的Doc文件夹内有一个文档
python 命令行模式下先 import 对应的模块，再输入 help(xxx) dir(xxx) 询问。
其中 help 模式下可以用空格上下页翻页，q键退出。

传入参数

写一个程序比较两个文件夹里的内容

import sys
from sys import argv    # argv holds the command-line arguments, like argv in C's main()
if len(argv) < 3:
    # Two directory arguments are required: print the usage line and stop.
    print(argv[0], " <dir1> <dir2>")
    sys.exit()
print(argv)
======================================================================================
['D:\\Code\\ML\\hello_python_source\\chapter 03\\my.py', 'test', 'test2']

sys.exit()

为了避免出错，有些条件不满足即可调用 exit() 退出程序。

argv

可以获取传入参数，以 list 的形式存储。

文件目录操作

import os
for directory in argv[1:]:
    # Stop at the first argument that does not exist on disk.
    if not os.access(directory, os.F_OK):
        print("this directory doesn't exist!")
        sys.exit()
    print("Open ", directory)
    # os.walk yields one (dirpath, dirnames, filenames) tuple per directory in
    # the tree; number them from 1 for display.
    for count, item in enumerate(os.walk(directory), 1):
        print("item ", count, item)
    print()
=======================================================================================
Open  test
item  1 ('test', ['dir1', 'dir3', 'test 2'], ['image1.gif', 'image2.gif', 'image3.gif', 'test1.txt', 'test2.txt', 'test3.txt'])
item  2 ('test\\dir1', [], [])
item  3 ('test\\dir3', [], [])
item  4 ('test\\test 2', [], ['test1.txt', 'test2.txt', 'test3.txt'])

Open  test2
item  1 ('test2', ['dir3', 'dir4', 'test 2'], ['image2.gif', 'image3.gif', 'image4.gif', 'test2.txt', 'test3.txt', 'test4.txt'])
item  2 ('test2\\dir3', [], [])
item  3 ('test2\\dir4', [], [])
item  4 ('test2\\test 2', [], ['test2.txt', 'test3.txt', 'test4.txt'])

元组

以括号的形式给出，上面每一个 item 都是一个元组。

os.walk()

以树状方式遍历一个目录下所有的文件，并给出一个包含本目录名、子目录名、子文件名在内的三元元组。
每个元组内都可能有多个对象，用list给出。

('本目录名', ['子目录1', '子目录2', '子目录3'], ['子文件1', '子文件2'])

os.listdir()

会枚举目录里第一层的所有对象（包括子目录和文件），返回它们的名称列表；但不会像 walk 一样递归走遍所有层级。配合递归调用，就可以得到下面这种更清晰的树状表示。

更清晰的文件目录预览

# tree directory
def prDire(filepath, item, space):
    """Print the contents of ``filepath`` as an indented tree.

    Args:
        filepath: Directory whose entries are listed.
        item: Not used by the body; kept so existing calls keep working.
        space: Nesting depth, starting at 1. Each extra level indents the
            output by two more spaces.

    Entries appear in whatever order ``os.listdir`` returns them, and every
    sub-directory is expanded recursively right below its own line.
    """
    indent = ' ' * (space - 1) * 2
    for entry in os.listdir(filepath):
        print(f"{indent}|-{entry}")
        child = os.path.join(filepath, entry)
        if os.path.isdir(child):
            prDire(child, entry, space + 1)

import os
import sys
from sys import argv

# Two directory arguments are required: print the usage line and stop.
if len(argv) < 3:
    print(argv[0], " <dir1> <dir2>")
    sys.exit()
print(argv)

for directory in argv[1:]:
    # Stop at the first argument that does not exist on disk.
    if not os.access(directory, os.F_OK):
        print("this directory doesn't exist!")
        sys.exit()
    print("Open ", directory)
    # Depth 1 makes the top-level entries print without indentation.
    prDire(directory, directory, 1)
    print()
=========================================================
...
Open  D:\Code\ML\hello_python_source
|-chapter 02
  |-my.py
  |-wumpus-1.py
  |-wumpus-2.py
  |-wumpus-3.py
  |-wumpus-4-functions.py
  |-wumpus-5-arrows.py
  |-wumpus-6-names.py
  |-wumpus-friend.py
|-chapter 03
  |-difference_engine_1_sys.py
  |-difference_engine_2_os.py
  |-difference_engine_3_file.py
  |-difference_engine_4_puttogether.py
  |-difference_engine_5_inorder.py
  |-difference_engine_6_directories.py
  |-my.py
  |-test
    |-dir1
    |-dir3
    |-image1.gif
    |-image2.gif
    |-image3.gif
    |-test 2
      |-test1.txt
      |-test2.txt
      |-test3.txt
    |-test1.txt
    |-test2.txt
    |-test3.txt
  |-test2
    |-dir3
    |-dir4
    |-image2.gif
    |-image3.gif
    |-image4.gif
    |-test 2
      |-test2.txt
      |-test3.txt
      |-test4.txt
    |-test2.txt
    |-test3.txt
    |-test4.txt
|-chapter 04
  |-test_todo.py
  |-todo.py
...

递归调用了打印函数，并且要记录对象的路径。

os.path.join(,)

将路径与文件名连接起来组成新的路径。由于在不同操作系统中连接符会有所区别，join函数可自适应调整连接符。

文件读写

# file I/O
import sys
import os
from sys import argv
if len(argv) < 3:
    print(argv[0], " <dir1> <dir2>")
    sys.exit()
print(argv)

# The demo file is expected to live inside the first directory argument.
filepath = os.path.join(argv[1], "ttt.txt")
if not os.access(filepath, os.F_OK):
    print(filepath, " doesn't exist!")
    sys.exit()
print(filepath)

# read() returns the whole file as a single string.
nfile = open(filepath, "r+")
print(nfile.read())
nfile.close()

# readlines() returns a list with one string per line.
nfile = open(filepath, "r+")
print(nfile.readlines())

# The file is deliberately NOT closed and reopened here: after readlines() the
# position is at the end of the file, so the writes below are appended instead
# of overwriting the existing content from the start.
nfile.write("new words\n")
nfile.writelines(['what', ' ', 'yes'])
nfile.close()

# Reopen to show the content that was written back to disk.
nfile = open(filepath, "r+")
print(nfile.read())
nfile.close()
==========================================================================
['D:\\Code\\ML\\hello_python_source\\chapter 03\\mye.py', 'test', 'test2']
test\ttt.txt
123 456 789
abc def
['123 456 789\n', 'abc def']
123 456 789
abc defnew words
what yes

open(,) close()

open() 接收文件路径和打开模式两个参数，返回一个文件对象，这里把它赋值给 nfile。
参数含义见runoob
r+ 打开一个文件用于读写。文件指针将会放在文件的开头
因此直接写会导致前面的内容被覆盖。因此可以先read再write，指针会放在结束位置。
a+ 打开一个文件用于读写。如果该文件已存在，文件指针将会放在文件的结尾。文件打开时会是追加模式。如果该文件不存在，创建新文件用于读写。
因此 write 的打开参数设置成这样就不用担心覆盖了。

文件写完以后要及时关闭（close），才能保证缓冲区里的内容被写入文件。

read() readlines()

前者读入整个文件作为一个字符串；后者将每行都分开存储成list

//read()
123 456 789
abc def
//readlines()
['123 456 789\n', 'abc def']

文件比较

# md5
from hashlib import md5
import sys
import os
from sys import argv
if len(argv) < 3:
    print(argv[0], " <dir1> <dir2>")
    sys.exit()
print(argv)

filepath = os.path.join(argv[1], "image1.gif")
if not os.access(filepath, os.F_OK):
    print(filepath, " doesn't exist!")
    sys.exit()
print(filepath)

# A GIF is binary data: open it with "rb" so that no text decoding or newline
# translation touches the bytes before they are hashed. Read-only access is
# enough, and the with-block closes the file even if reading fails.
with open(filepath, "rb") as nfile:
    text = nfile.readlines()
print(text)

ans = md5()
for x in text:
    # update() only accepts bytes; binary mode already yields bytes objects.
    ans.update(x)
    # Running digest after each chunk; the last value printed is the digest
    # of the whole file.
    print(ans.hexdigest())

md5()

可以为文件内容计算出一个 128 位（写成十六进制是 32 个字符）的摘要（哈希值）。注意 MD5 是散列算法而不是加密算法，结果不可逆；原理可参见科普页漫画：什么是 MD5 算法？
逐行读入文件，用 update() 把每一行依次累加进摘要。注意 update() 的参数必须是 bytes 类型：以文本模式读入时，需要先用 encode("utf-8") 编码再处理；也可以一开始就用二进制模式打开，这样 readlines() 返回的每一行本身就是 bytes。对图片这类二进制文件应当使用二进制模式。

nfile = open(filepath, "rb")

两个文件夹内容比较

# file compare
# both 1 and 2, but differ (diff) false
# only 1 (+) true
# only 2 (+) true
import sys
import os
from sys import argv
from hashlib import md5

def prDire(filepath, item, space, file_list):
    """Print ``filepath`` as an indented tree, tagging entries in ``file_list``.

    Args:
        filepath: Directory whose entries are listed.
        item: Not used by the body; kept so existing calls keep working.
        space: Nesting depth, starting at 1. Depth 1 also prints an
            "open <filepath>" header first.
        file_list: Mapping from entry path to a flag. A path mapped to True is
            tagged ``(+)``, any other mapped path is tagged ``(diff)``, and a
            path missing from the mapping gets a single trailing space.
    """
    if space == 1:
        print('\n', 'open', filepath)
    indent = ' ' * (space - 1) * 2
    for entry in os.listdir(filepath):
        child = os.path.join(filepath, entry)
        if child not in file_list:
            marker = ' '
        elif file_list[child] == True:
            marker = '(+)'
        else:
            marker = '(diff)'
        print(f"{indent}|-{entry}{marker}")
        if os.path.isdir(child):
            prDire(child, entry, space + 1, file_list)

def md5_file(filepath):
    """Return the MD5 hex digest of a file's bytes.

    Args:
        filepath: Path of the file to hash.

    Returns:
        The 32-character hexadecimal digest string, or the integer 0 when
        ``filepath`` is a directory (so two directories always compare equal).
    """
    if os.path.isdir(filepath):
        return 0
    ret = md5()
    # The with-block guarantees the handle is closed; reading fixed-size
    # chunks keeps memory use flat regardless of file size.
    with open(filepath, "rb") as file:
        for chunk in iter(lambda: file.read(65536), b""):
            ret.update(chunk)
    return ret.hexdigest()

def listingDirectory(directory):
    """Collect every sub-directory and file path found under ``directory``.

    Args:
        directory: Root directory to walk.

    Returns:
        A dict mapping each path (``directory`` joined with the relative
        location) to False.

    Exits the program via ``sys.exit()`` after printing a message when
    ``directory`` does not exist.
    """
    if not os.access(directory, os.F_OK):
        print("this directory doesn't exist!")
        sys.exit()
    file_list = {}
    for path, dirs, files in os.walk(directory):
        # Sub-directories first, then files, for each directory visited.
        for each in dirs + files:
            file_list[os.path.join(path, each)] = False
    return file_list

def singleOwner(i, j):
    """Flag the entries of directory ``i`` that have no counterpart in ``j``.

    Works on the module-level ``file_list`` and ``argv``. Each path recorded
    for ``argv[i]`` is mapped onto ``argv[j]`` by swapping the leading root,
    and its flag is set to True when the mapped path is not recorded for ``j``.
    """
    own = file_list[i]
    other = file_list[j]
    root_len = len(argv[i])
    for path in list(own):
        counterpart = argv[j] + path[root_len:]
        if counterpart not in other:
            own[path] = True

# Exactly two directory arguments are required.
if len(argv) != 3:
    print(argv[0], " <dir1> <dir2>")
    sys.exit()

# Slot 0 is a placeholder so that list indices line up with argv positions.
file_list = [{}, listingDirectory(argv[1]), listingDirectory(argv[2])]

# Flag the entries that exist on one side only, in both directions.
for i, j in ((1, 2), (2, 1)):
    singleOwner(i, j)

# Entries still unflagged exist on both sides: drop the pairs whose digests
# match, so only differing ones stay (and are later shown as "(diff)").
# Iterate over a copy of the keys because entries are deleted inside the loop.
for head in list(file_list[1].keys()):
    if file_list[1][head]:
        continue
    each = argv[2] + head[len(argv[1]):]
    if md5_file(head) == md5_file(each):
        del file_list[1][head]
        del file_list[2][each]

for k in (1, 2):
    prDire(argv[k], argv[k], 1, file_list[k])
================================================================

 open test
|-dir1(+)
|-dir3
|-image1.gif(+)
|-image2.gif(diff)
|-image3.gif
|-test 2
  |-test1.txt(+)
  |-test2.txt(diff)
  |-test3.txt
  |-timg.jpg
|-test1.txt(+)
|-test2.txt(diff)
|-test3.txt
|-ttt.txt(+)

 open test2
|-dir3
|-dir4(+)
|-image2.gif(diff)
|-image3.gif
|-image4.gif(+)
|-test 2
  |-test2.txt(diff)
  |-test3.txt
  |-test4.txt(+)
  |-timg.jpg
|-test2.txt(diff)
|-test3.txt
|-test4.txt(+)

list()

如果在遍历字典的同时增删其中的键，会报错；应先用 list() 把字典的键复制成一个列表，再对这个列表进行遍历。

for head in file_list[i].keys(): ....
==========================================================
RuntimeError: dictionary changed size during iteration
==========================================================
->   for head in list(file_list[i].keys()):