Python实现Huffman编码

基于Huffman编码的压缩软件的Python实现

个人分类: 算法 Python

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/xanxus46/article/details/41359841

哈夫曼编码是利用贪心算法进行文本压缩的算法,其算法思想是首先统计文件中各字符出现的次数,保存到数组中,然后将各字符按照次数升序排序,挑选次数最小的两个元素进行连结形成子树,子树的次数等于两节点的次数之和,接着把两个元素从数组删除,将子树放入数组,重新排序,重复以上步骤。为了解压,在压缩时首先往文件中填入huffman编码的映射表的长度,该表的序列化字符串,编码字符串分组后最后一组的长度(编码后字符串长度模上分组长度),最后再填充编码后的字符串。本算法中以一个字节,8位作为分组长度,将编码后二进制字符串一一分组。代码如下:


__author__ = 'linfuyuan'

import struct

import pickle

# Operating mode read from the console: 0 selects the compression branch,
# 1 selects the decompression branch. int() raises ValueError when the
# entered text is not an integer.
# NOTE: this module-level name shadows the built-in `type`.
type = int(raw_input('please input the type number(0 for compress, 1 for decompress):'))

# Path of the file to compress or decompress, read from the console.
# NOTE: this module-level name shadows the Python 2 built-in `file`.
file = raw_input('please input the filepath:')

class Node:
    """One node of the Huffman tree.

    A freshly created node is an empty leaf: no children, an empty symbol,
    a frequency of zero and an empty code. The script fills the attributes
    in after construction.
    """

    def __init__(self):
        # Child links; both remain None for a leaf.
        self.left = None
        self.right = None
        # Symbol stored in the node and the number of times it was counted.
        self.value = ''
        self.frequency = 0
        # Bit-string ('0'/'1' characters) assigned to the node.
        self.code = ''

def change_value_to_key(huffmap):
    """Return a new dict that maps every value of *huffmap* back to its key.

    The values are expected to be unique so that they can serve as keys;
    if two keys share a value, the pair visited last wins.
    """
    return dict((code, symbol) for symbol, code in huffmap.items())

if type == 0:
    # Compression branch: Huffman-encode `file` into '<file>_compress'.
    #
    # Output layout:
    #   1. struct 'I' : byte length of the pickled code table
    #   2. the pickled dict mapping each character to its bit-string code
    #   3. struct 'B' : number of meaningful bits in the final data byte
    #   4. the encoded bit string, packed eight bits per byte

    def give_code(node):
        """Assign codes to every node below *node*.

        A left child receives its parent's code followed by '0', a right
        child the parent's code followed by '1'. The code of *node* itself
        is left untouched.
        """
        if node.left:
            node.left.code = '%s%s' % (node.code, '0')
            give_code(node.left)
        if node.right:
            node.right.code = '%s%s' % (node.code, '1')
            give_code(node.right)

    def print_code(node):
        """Print "<value> <code>" for every leaf below (and including) *node*."""
        if not node.left and not node.right:
            print("%s %s" % (node.value, node.code))
        if node.left:
            print_code(node.left)
        if node.right:
            print_code(node.right)

    def save_code(code_map, node):
        """Store value -> code in *code_map* for every leaf below *node*."""
        if not node.left and not node.right:
            code_map[node.value] = node.code
        if node.left:
            save_code(code_map, node.left)
        if node.right:
            save_code(code_map, node.right)

    def frequency_of(node):
        """Sort key: the node's occurrence count."""
        return node.frequency

    # Read the whole input once and count how often each character occurs.
    with open(file) as f:
        origindata = f.read()
    lettermap = {}
    for letter in origindata:
        lettermap[letter] = lettermap.get(letter, 0) + 1

    # One leaf per distinct character, ordered by ascending frequency.
    nodelist = []
    for (key, value) in lettermap.items():
        node = Node()
        node.value = key
        node.frequency = value
        nodelist.append(node)
    nodelist.sort(key=frequency_of)

    # Repeatedly merge the two least frequent nodes into a subtree until a
    # single root remains. The merged node is placed first before the
    # stable re-sort, exactly as before, so ties resolve the same way.
    while len(nodelist) > 1:
        node = Node()
        node.left = nodelist[0]
        node.right = nodelist[1]
        node.frequency = node.left.frequency + node.right.frequency
        nodelist[0:2] = [node]
        nodelist.sort(key=frequency_of)

    # Give every leaf its code and collect the character -> code table.
    # An empty input has no tree at all and yields an empty table.
    root = nodelist[0] if nodelist else None
    huffman_map = {}
    if root is not None:
        if not root.left and not root.right:
            # Only one distinct character: the root is itself a leaf and
            # would keep the empty code, so nothing would be written and
            # the data could not be restored. Give it a one-bit code.
            root.code = '0'
        give_code(root)
        save_code(huffman_map, root)

    # Encode the input; join once instead of growing a string per character.
    code_data = ''.join([huffman_map[letter] for letter in origindata])

    # Number of bits held by the final byte. A full final byte is recorded
    # as 8 rather than 0 so that the reader pads it to eight bits; 0 is
    # only written when there is no payload at all.
    last_group_bits = len(code_data) % 8
    if code_data and last_group_bits == 0:
        last_group_bits = 8

    huffman_map_bytes = pickle.dumps(huffman_map)
    # `with` guarantees the output file is closed even if a write fails.
    with open('%s_compress' % file, 'wb') as f:
        f.write(struct.pack('I', len(huffman_map_bytes)))
        f.write(struct.pack('%ds' % len(huffman_map_bytes), huffman_map_bytes))
        f.write(struct.pack('B', last_group_bits))
        for i in range(0, len(code_data), 8):
            # The slice is simply shorter for the final group; its bits are
            # stored as the low-order bits of the last byte.
            f.write(struct.pack('B', int(code_data[i:i + 8], 2)))

    print('finished compressing')

if type == 1:
    # Decompression branch: restore '<file>_origin' from a file produced by
    # the compression branch.
    #
    # Input layout:
    #   1. struct 'I' : byte length of the pickled code table
    #   2. the pickled dict mapping each character to its bit-string code
    #   3. struct 'B' : number of meaningful bits in the final data byte
    #   4. the encoded bit string, packed eight bits per byte

    with open(file, 'rb') as f:
        size = struct.unpack('I', f.read(4))[0]
        # NOTE(security): pickle.loads can execute arbitrary code while
        # loading; only decompress files that come from a trusted source.
        huffman_map = pickle.loads(f.read(size))
        left = struct.unpack('B', f.read(1))[0]
        # Read the remaining payload in one go; iterating a bytearray
        # yields integer byte values.
        payload = bytearray(f.read())

    # Every byte is expanded to exactly eight bits, restoring the leading
    # zeros that bin() drops.
    datalist = [bin(byte)[2:].zfill(8) for byte in payload]
    if datalist:
        # The final byte only carries `left` bits. A stored length of 0
        # means the bit count was a multiple of 8, i.e. the final byte is
        # full; padding it to 0 bits would drop its leading zeros and
        # corrupt the end of the output.
        last_bits = left or 8
        datalist[-1] = bin(payload[-1])[2:].zfill(last_bits)
    encode_data = ''.join(datalist)

    # Invert the table to code -> character and walk the bit string,
    # emitting a character each time the collected bits form a known code.
    huffman_map = change_value_to_key(huffman_map)
    decoded = []
    current_code = ''
    for letter in encode_data:
        current_code += letter
        if current_code in huffman_map:
            decoded.append(huffman_map[current_code])
            current_code = ''

    # `with` guarantees the output file is closed even if the write fails.
    with open('%s_origin' % file, 'w') as f:
        f.write(''.join(decoded))

    print('finished decompressing')

# Wait for the user to submit a line of input (Enter) before the script
# ends; presumably this keeps the console window open after the work is done.
raw_input('please press any key to quit')

代码中有用到pickle模块进行对象序列化,还有struct模块进行读写二进制文件。

由于算法中运算量最大的地方在于循环里嵌套了排序,故算法的时间复杂度是 $O(n^2 \log n)$。

经过压缩后,文件大小分别为110KB和931KB;原来大小分别为190KB和2.1MB,压缩效果明显。

希望对大家有用。

猜你喜欢

转载自blog.csdn.net/tiankongtiankong01/article/details/83038155