Python 哈夫曼压缩与解压算法

版权声明:本文为博主原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。
本文链接: https://blog.csdn.net/klli15/article/details/100522240

Python 哈夫曼压缩与解压算法

压缩

#encoding: utf-8
from bitarray import bitarray
import random
import json

class Node(object):
	"""A node of the Huffman tree.

	Attributes:
		left: left child (the "0" branch when codes are assigned), or None.
		right: right child (the "1" branch), or None.
		times: weight of the node (occurrence count of its symbols).
		char: the symbol of a leaf; for a merged node, the concatenation of
			its children's ``char`` values.
		parent: the parent node, or None when no parent has been linked.
	"""
	# Class-level defaults are kept so that ``Node.times`` etc. still resolve.
	left = None
	right = None
	times = 0
	char = ''
	parent = None

	def __init__(self, char='', times=0, left=None, right=None, parent=None):
		"""Create a node; every argument is optional, so ``Node()`` still works."""
		super(Node, self).__init__()
		self.char = char
		self.times = times
		self.left = left
		self.right = right
		self.parent = parent

	def __repr__(self):
		"""Return a short debugging representation (symbol and weight)."""
		return "Node(char=%r, times=%r)" % (self.char, self.times)
# Build the weighted symbol table: how often each character occurs.
def countTimes(str):
	"""Count the occurrences of every element of ``str``.

	Returns:
		A list of ``(element, count)`` pairs sorted by ascending count.
		Elements with equal counts keep the order in which they first
		appeared (dict insertion order combined with a stable sort).
	"""
	tally = {}
	for symbol in str:
		tally[symbol] = tally.get(symbol, 0) + 1
	pairs = list(tally.items())
	pairs.sort(key=lambda pair: pair[1])
	return pairs

# Turn the (char, count) pairs into a Huffman tree.
def getTree(data):
	"""Build a Huffman tree from ``(char, count)`` pairs.

	Args:
		data: iterable of pairs whose first item is the symbol and whose
			second item is its weight, e.g. the list returned by
			``countTimes``.

	Returns:
		The root ``Node`` of the tree (holding the root gives access to the
		whole tree), or ``None`` when ``data`` is empty. When ``data`` holds a
		single symbol, the leaf is hung on the left branch of a one-child
		root so that ``encode`` assigns it the one-bit code "0".
	"""
	# Wrap every pair in a leaf node.
	nodes = []
	for item in data:
		leaf = Node()
		leaf.char = item[0]
		leaf.times = item[1]
		nodes.append(leaf)

	if not nodes:
		return None

	if len(nodes) == 1:
		# A lone leaf would get the empty code; give it a parent instead.
		only = nodes[0]
		root = Node()
		root.char = only.char
		root.times = only.times
		root.left = only
		only.parent = root
		return root

	while len(nodes) >= 2:
		# The two lightest nodes are merged first. The sort is stable, so
		# nodes of equal weight keep their current relative order.
		nodes.sort(key=lambda node: node.times)
		first, second = nodes[0], nodes[1]
		merged = Node()
		merged.times = first.times + second.times
		merged.char = first.char + second.char
		merged.left = first
		merged.right = second
		first.parent = merged
		second.parent = merged
		# Replace the two merged nodes by their parent for the next round.
		del nodes[0:2]
		nodes.append(merged)
	return nodes[0]
# Huffman coding by recursive descent. Only leaves receive a code; the outcome
# is a char -> Huffman-code dict.
def encode(head,code="",table=None):
	"""Assign a Huffman code to every leaf below ``head``.

	Args:
		head: root of the (sub)tree to walk; ``None`` is accepted and adds
			nothing.
		code: code prefix accumulated on the way down ("0" for each left
			step, "1" for each right step).
		table: dict receiving the ``char -> code`` entries. When omitted, the
			module-level ``result`` dict is used (and created if it does not
			exist yet), which keeps ``result = {}; encode(head)`` working.

	Returns:
		The dict that received the codes.
	"""
	global result # needed to (re)bind the module-level table from here
	if table is None:
		try:
			table = result
		except NameError:
			# No module-level table yet: create it.
			result = table = {}
	if head is None:
		return table
	if head.left is None and head.right is None:
		# A leaf that is itself the root would get the empty code, which
		# cannot be decoded; give it the one-bit code "0" instead.
		table[head.char] = code or "0"
		return table
	if head.left is not None:
		encode(head.left,code+"0",table)
	if head.right is not None:
		encode(head.right,code+"1",table)
	return table
		
def printTree(head):
	"""Print every node below ``head`` in post-order.

	The left subtree is printed first, then the right subtree, then the node
	itself, one ``char:<char>`` / ``times:<times>`` line per node.
	"""
	for child in (head.left, head.right):
		if child is not None:
			printTree(child)
	line = "char:%s\ttimes:%d"%(head.char,head.times)
	print(line)

# Serialise a dict into a bitarray so that it can be written to the binary
# compressed file.
def dict2bits(dictObject,endian="little"):
	"""Return the JSON text of ``dictObject`` as a ``bitarray``.

	Args:
		dictObject: a JSON-serialisable dict.
		endian: bit-endianness passed to the ``bitarray`` constructor.

	Returns:
		A ``bitarray`` holding the UTF-8 bytes of ``json.dumps(dictObject)``.
	"""
	payload = json.dumps(dictObject).encode("utf-8")
	bits = bitarray(endian=endian)
	bits.frombytes(payload)
	return bits

# Compress the original string with the given Huffman code table.
def zipTobits(s,encodeList,endian="little"):
	"""Encode ``s`` bit by bit and pad the result to a whole number of bytes.

	Args:
		s: the text to compress; every character must be a key of
			``encodeList``.
		encodeList: dict mapping each character to its code string.
		endian: bit-endianness passed to the ``bitarray`` constructor.

	Returns:
		A ``bitarray`` with the concatenated codes, followed by zero bits up
		to the next multiple of eight.
	"""
	packed = bitarray(endian=endian)
	for symbol in s:
		for digit in encodeList[symbol]:
			# "0" clears the bit; any other character sets it.
			packed.append(digit != "0")
	# Zero-fill an incomplete last byte. Author's note: without this the data
	# would be zero-filled anyway, since the file is read back byte-wise.
	padding = -len(packed) % 8
	for _ in range(padding):
		packed.append(False)
	return packed
# Save the compressed data as a binary file.
def saveBits(bits,encodeList):
	"""Write the code table and the compressed bits to ``ziped.hfm``.

	Args:
		bits: the compressed payload (as produced by ``zipTobits``).
		encodeList: the char -> Huffman-code dict, stored in front of the
			payload via ``dict2bits``.
	"""
	with open("ziped.hfm","wb") as out:
		# Header first (char -> Huffman-code dict), then the compressed data.
		header = dict2bits(encodeList)
		out.write(header)
		out.write(bits)
# Keep a copy of the original (uncompressed) string.
def saveStr(str):
	"""Write ``str`` to the text file ``unziped.hfm``, replacing old content."""
	with open("unziped.hfm","w") as out:
		out.write(str)

def getSeedStr(times):
	"""Return a string of ``times`` random lowercase letters (``a`` to ``z``)."""
	low, high = ord('a'), ord('z')
	return "".join(chr(random.randint(low, high)) for _ in range(0, times))

# Alternative, deterministic sample input (a short pattern doubled 9 times):
# theStr = "abcabcabcabcabcabcddddddddd"
# for x in range(1,10):
# 	theStr+=theStr

# Compress a random 100-letter string and keep the original for comparison.
theStr=getSeedStr(100)
# Count the characters of the string that is actually compressed. The name
# `a` used here before only exists in the commented-out sample above, so the
# script stopped with a NameError.
sortedTimes = countTimes(theStr)
head=getTree(sortedTimes)
result = {}  # filled by encode(): char -> Huffman code
encode(head)
bits= zipTobits(theStr,result)
saveBits(bits,result)
saveStr(theStr)

解压

#encoding: utf-8
from bitarray import bitarray
import json
# Read the binary file; return the char -> Huffman-code dict (still as JSON
# text) together with the compressed bits.
def _findHeaderEnd(data):
	"""Return the index of the ``}`` closing the leading JSON object, or -1.

	Braces inside double-quoted JSON strings are skipped, so a key such as
	"}" does not end the header early. Only a flat (non-nested) object is
	supported, which is what the code table is.
	"""
	if not data.startswith(b"{"):
		return -1
	inString = False
	escaped = False
	for index, byte in enumerate(data):
		if inString:
			if escaped:
				escaped = False
			elif byte == 0x5C:  # backslash: the next byte is escaped
				escaped = True
			elif byte == 0x22:  # closing double quote
				inString = False
		elif byte == 0x22:  # opening double quote
			inString = True
		elif byte == 0x7D:  # "}" outside any string ends the header
			return index
	return -1


def readFile(filepath):
	"""Split a compressed file into its JSON header and its bit payload.

	Args:
		filepath: path of a file that starts with the JSON code table and is
			followed by the compressed bytes.

	Returns:
		``(encodeListStr, zipedBits)``: the header as JSON text and the
		remaining bytes as a little-endian ``bitarray``. When the file does
		not start with ``{`` or the header is never closed, "Data Error" is
		printed and ``None`` is returned.
	"""
	with open(filepath,"rb") as f:
		data = f.read()
	# The previous byte-wise loop compared the bytes it read with the str '',
	# which is never equal, so a truncated header made it spin forever.
	headerEnd = _findHeaderEnd(data)
	if headerEnd < 0:
		print("Data Error")
		return
	encodeListStr = data[:headerEnd + 1].decode()
	zipedBits = bitarray(endian="little")
	zipedBits.frombytes(data[headerEnd + 1:])
	return encodeListStr,zipedBits

# The code table arrives as {'a': '00'}; convert every code to a bitarray,
# i.e. {'a': bitarray('00')}, so that it can be compared with incoming bits.
def changeToBitArray(encodeList):
	"""Return a copy of ``encodeList`` whose codes are ``bitarray`` objects.

	Args:
		encodeList: dict mapping each character to its code string.

	Returns:
		A dict with the same keys; each value is a little-endian ``bitarray``
		with one bit per code character ("0" -> 0, anything else -> 1).
	"""
	converted = {}
	for symbol, code in encodeList.items():
		pattern = bitarray(endian="little")
		for digit in code:
			pattern.append(digit != "0")
		converted[symbol] = pattern
	return converted

# Does the bit pattern match one of the codes?
def contain(bits,encodeBitArray):
	"""Look ``bits`` up among the values of ``encodeBitArray``.

	Returns:
		``(True, key)`` for the first key whose value equals ``bits``, or
		``(False, '')`` when no value matches.
	"""
	for symbol, pattern in encodeBitArray.items():
		if pattern == bits:
			return True, symbol
	return False, ''
def saveStr(str):
	"""Write ``str`` to the text file ``decode.hfm``, replacing old content."""
	with open("decode.hfm","w") as out:
		out.write(str)

# Core of the decompression.
def decode(encodeList,bits):
	"""Translate a Huffman-coded bit sequence back into text.

	Args:
		encodeList: dict mapping each character to its code string; "0"
			stands for a 0 bit and any other character for a 1 bit.
		bits: iterable of bits (for example a ``bitarray``).

	Returns:
		The decoded string. Trailing bits that do not complete a code are
		dropped. Zero bits added as padding by the compressor are decoded
		like data, so they may still yield extra trailing characters.
	"""
	# Reverse lookup code -> char: one dict probe per bit replaces a scan
	# over the whole table. If two characters share a code, the first one in
	# the table wins.
	lookup = {}
	for char, code in encodeList.items():
		key = "".join("0" if digit == "0" else "1" for digit in code)
		lookup.setdefault(key, char)

	pieces = []
	pending = ""
	# Iterating avoids both the deprecated bits.length() call and the
	# bits = bits[1:] copy, which made decoding quadratic in the input size.
	for bit in bits:
		pending += "1" if bit else "0"
		if pending in lookup:
			pieces.append(lookup[pending])
			pending = ""
	return "".join(pieces)

# Decompress ziped.hfm: read the header and payload, rebuild the code table,
# decode the bits and write the text to decode.hfm.
fileContent = readFile("ziped.hfm")
if fileContent is None:
	# readFile has already printed "Data Error"; stop here instead of failing
	# with a TypeError while unpacking None.
	raise SystemExit(1)
encodeListStr,zipedBits = fileContent
encodeList = json.loads(encodeListStr)
decodeStr = decode(encodeList,zipedBits)
saveStr(decodeStr)

主要问题

  1. 末尾不满一个字节时的填充问题。压缩结果的比特数不是 8 的倍数时,末尾会用 0 补齐;有一种场景是补上了 00,而 00 恰好是某个字符的哈夫曼编码,解压时就会多解出字符,此时不知如何进行处理。
  2. 哈夫曼压缩算法好像只能对字符进行压缩,这里把字符范围限制在 [a-z]。其实也可以不限制,只不过要多做一些处理,比如待编码的字符包括 { } 时,要进行转义操作之类的。
  3. 本人不擅长算法,所以解压时每读入一个比特,都只能把当前的比特串和所有编码逐一比较一遍,不知道有没有其他算法可以改善这个情况?希望有大佬能指点指点。

至于第一个问题,如果有人能告诉我如何解决,那就更好了。

猜你喜欢

转载自blog.csdn.net/klli15/article/details/100522240