利用霍夫曼编码文件压缩

一.优先队列实现(最小堆)

也可以直接用STL
/// Priority queue implemented as a binary min-heap.
///
/// Elements are kept in a 1-based array: heap[1] is the root (smallest
/// element) and slot 0 is never used. T must be default-constructible,
/// copy-assignable and comparable with operator< and operator>.
///
/// The queue owns its storage and releases it in the destructor. Copying is
/// disabled because a member-wise copy would make two queues free the same
/// array.
template<class T>
class pQueue
{
private:
	T * heap;  // owned storage; live elements occupy heap[1..size]
	int Max;   // number of elements the current storage can hold
	int size;  // number of elements currently stored
public:
	pQueue(int max=10);          // create an empty queue with room for `max` elements
	~pQueue() { delete[] heap; } // release the storage allocated with new[]
	pQueue(const pQueue&) = delete;
	pQueue& operator=(const pQueue&) = delete;
	void heapify(int i);         // sift the element at index i down until the min-heap property holds
	void Build(T* A,int n);      // rebuild the heap from the n elements A[1..n]
	int LEFT(int i);             // index of the left child of i
	int RIGHT(int i);            // index of the right child of i
	int PARENT(int i);           // index of the parent of i
	void swap(T& a, T& b);       // exchange two elements
	T MAX();                     // remove and return the root, i.e. the SMALLEST element (name is historical)
	void Insert(T key);          // add an element, growing the storage when full
	void Out(int i);             // print the subtree rooted at i in in-order
};
// Index of the left child of node i in the 1-based heap array.
template<class T>
int pQueue<T>::LEFT(int i)
{
	const int child = i + i;
	return child;
}
// Index of the parent of node i in the 1-based heap array.
template<class T>
int pQueue<T>::PARENT(int i)
{
	const int parent = i / 2;
	return parent;
}
// Index of the right child of node i: the slot directly after the left child.
template<class T>
int pQueue<T>::RIGHT(int i)
{
	const int child = i + i + 1;
	return child;
}
// Exchanges the values of a and b through a temporary copy.
template<class T>
void pQueue<T>::swap(T& a, T& b)
{
	T saved(a);
	a = b;
	b = saved;
}
// Creates an empty queue able to hold `max` elements before it has to grow.
// A negative `max` is treated as 0.
template<class T>
pQueue<T>::pQueue(int max)
{
	if (max < 0)
		max = 0;
	size = 0;
	Max = max;
	// One extra slot because index 0 is unused. The trailing "()" value-
	// initializes every element (zero for plain structs and scalars), which
	// replaces the former memset: byte-wise clearing is undefined for element
	// types that are not trivially copyable.
	heap = new T[max + 1]();
}
// Sift-down (after CLRS): starting at index i, repeatedly swaps the node with
// its smaller child until neither child is smaller, restoring the min-heap
// property for the subtree rooted at i.
template<class T>
void pQueue<T>::heapify(int i)
{
	int cur = i;
	for (;;)
	{
		const int left = LEFT(cur);
		const int right = RIGHT(cur);

		// Pick the smallest of the node and its (existing) children.
		int smallest = cur;
		if (left <= size && heap[left] < heap[smallest])
			smallest = left;
		if (right <= size && heap[right] < heap[smallest])
			smallest = right;

		if (smallest == cur)
			return; // heap property holds here, nothing below changed

		swap(heap[cur], heap[smallest]); // move the smaller child up
		cur = smallest;                  // continue with the displaced element
	}
}
// Replaces the queue contents with the n elements A[1..n] and heapifies them.
//
// A is read 1-based (A[0] is ignored), matching the internal layout.
// A null A or a negative n yields an empty queue.
template<class T>
void pQueue<T>::Build(T* A,int n)
{
	if (A == nullptr || n < 0)
		n = 0;

	// Grow to twice the requested size when the current storage is too small.
	T* target = heap;
	if (n > Max)
		target = new T[2 * n + 1]();

	// Always copy the input. Previously the copy happened only on the growth
	// path, so a Build() that fitted the existing capacity heapified stale data.
	for (int i = 1; i <= n; i++)
		target[i] = A[i];

	if (target != heap)
	{
		delete[] heap; // storage comes from new[], so it must be released with delete[]
		heap = target;
		Max = 2 * n;
	}
	size = n;

	// Sift down every internal node, starting from the last one.
	for (int i = n / 2; i >= 1; i--)
	{
		heapify(i);
	}
}
// Removes and returns the root of the heap, i.e. the SMALLEST element
// (the name MAX is historical).
//
// On an empty queue a value-initialized T is returned and the queue is left
// untouched; previously this read an unused slot and drove `size` negative,
// which corrupted later insertions.
template<class T>
T pQueue<T>::MAX()
{
	if (size < 1)
		return T();

	T top = heap[1];
	heap[1] = heap[size]; // move the last element to the root ...
	size--;
	heapify(1);           // ... and sift it down to its place
	return top;
}
// Adds `key` to the queue, doubling the storage when it is full.
template<class T>
void pQueue<T>::Insert(T key)
{
	const int slot = size + 1; // index the new element will occupy
	if (slot > Max)
	{
		// Allocate before touching any member so a failed allocation leaves
		// the queue unchanged.
		const int newMax = 2 * slot;
		T* grown = new T[newMax + 1]();
		for (int i = 1; i <= size; i++)
			grown[i] = heap[i];
		delete[] heap; // storage comes from new[], so it must be released with delete[]
		heap = grown;
		Max = newMax;
	}

	size = slot;
	heap[size] = key; // append at the bottom of the tree

	// Sift up: swap with the parent while the parent is larger.
	int i = size;
	while (i > 1 && heap[PARENT(i)] > heap[i])
	{
		swap(heap[i], heap[PARENT(i)]);
		i = PARENT(i);
	}
}
// Prints the subtree rooted at index i to cout in in-order:
// "( <left> <- value -> <right> )", omitting children that do not exist.
template<class T>
void pQueue<T>::Out(int i)
{
	const int left = LEFT(i);
	const int right = RIGHT(i);

	cout << "( ";
	if (left <= size)
	{
		Out(left);
	}
	cout << " <- " << heap[i] << " -> ";
	if (right <= size)
	{
		Out(right);
	}
	cout << " )";
}

二.霍夫曼编码:

简单记一下什么是霍夫曼编码。对于一些字符,出现的频率分别为W1, W2, W3, ..., Wn;对这些字符使用二进制编码,编码的长度分别为L1, L2, L3, ..., Ln。霍夫曼编码可以使 W1 * L1 + W2 * L2 + W3 * L3 + ... + Wn * Ln 最小。霍夫曼编码利用二叉树实现,最初有n个结点对应n个字符,权值分别为对应的频率;之后每次从已有的子树中挑选两个权值最小的组成新的子树,两个子树共用一个新生成的根结点产生新子树,新子树的权值为原两个子树的权值之和,最终从根结点到叶节点的路径长即为编码长度,可规定右侧取1编码,左侧取0编码或其他方式。(叶节点一定对应字符)
#include<vector>
#include<fstream>
#include<map>
using namespace std;
/// One node of the Huffman tree. Nodes live in a flat array and refer to each
/// other by array index; the ordering operators compare nodes by weight.
struct TNode
{
	char data;      ///< symbol stored in the node
	double weight;  ///< frequency of the symbol, or the sum of both children
	int l;          ///< array index of the left child
	int r;          ///< array index of the right child
	int pre;        ///< array index of the parent
	int index;      ///< array index of this node itself
	friend bool operator<(TNode a, TNode b);  ///< true when a is lighter than b
	friend bool operator>(TNode a, TNode b);  ///< true when a is heavier than b
};
/// Huffman tree plus the per-symbol code table derived from it.
///
/// The object owns the node array and every code string, and frees them in
/// its destructor. Copying is therefore disabled: a member-wise copy would
/// free the same memory twice.
class Huffman
{
private:
	int numLeaf;       // number of leaves, i.e. number of distinct symbols
	int numNode;       // total number of nodes in the tree
	TNode* Tree;       // owned node array; leaves come first, the root is the last node
	vector<char> data; // the distinct symbols
	vector<int> W;     // frequency of each symbol, parallel to `data`
	vector<char*>S;    // S[i] is the NUL-terminated '0'/'1' code of the i-th symbol
	map<char, int> M;  // symbol -> its position i, so S[M[v]] is the code of v
public:
	Huffman();
	Huffman(const Huffman&) = delete;
	Huffman& operator=(const Huffman&) = delete;
	void Bulid(vector<int> W, vector<char> v, int n); // build the tree from n symbols (spelling kept for existing callers)
	void HuffmanCode();        // derive the code of every symbol from the built tree
	int visTreeindex(int i);   // index field of node i
	int visTreeLindex(int i);  // left-child index of node i
	int visTreeRindex(int i);  // right-child index of node i
	char visTreechar(int i);   // symbol stored in node i
	char visS(int x, int y);   // y-th character of the code of symbol x
	int visM(char c);          // position of symbol c in the code table
	~Huffman();                // frees the node array and the code strings
};

函数定义:

#include"Huffman_.h"
#include"Queue_P.h"
#include<iostream>
using namespace std;
// Orders tree nodes by weight: a > b when a is the heavier node.
bool operator>(TNode a, TNode b)
{
	return b.weight < a.weight;
}
// Orders tree nodes by weight: a < b when a is the lighter node.
bool operator<(TNode a, TNode b)
{
	return b.weight > a.weight;
}
// Creates an empty coder: no tree allocated, zero leaves and zero nodes.
Huffman::Huffman()
	: numLeaf(0), numNode(0), Tree(nullptr)
{
}
// Builds the Huffman tree for n symbols.
//
//   We : frequency of each symbol
//   v  : the symbols, parallel to We
//   n  : number of symbols to use; clamped to [0, min(We.size(), v.size())]
//
// Leaves occupy Tree[0..n-1] in input order, internal nodes follow, and the
// root is the last node (index 2n-2). A link value of 0 in pre/l/r doubles as
// "none": parents always have an index >= n >= 1, and an internal node never
// has both children equal to 0.
//
// Any tree, symbol map and code table from a previous build are discarded
// first, so the object can be rebuilt without stale entries surviving.
void Huffman::Bulid(vector<int> We, vector<char> v, int n)
{
	// Never read past the end of the input vectors.
	if (n < 0)
		n = 0;
	if (static_cast<vector<int>::size_type>(n) > We.size())
		n = static_cast<int>(We.size());
	if (static_cast<vector<char>::size_type>(n) > v.size())
		n = static_cast<int>(v.size());

	// Drop the state of an earlier build. map::insert does not overwrite, so
	// without the clear an old symbol -> index mapping would win.
	for (vector<char*>::size_type k = 0; k < S.size(); k++)
		delete[] S[k];
	S.clear();
	M.clear();
	delete[] Tree;
	Tree = nullptr;

	// The parameters are private copies already; take over their storage.
	W.swap(We);
	data.swap(v);
	numLeaf = n;
	numNode = (n > 0) ? 2 * n - 1 : 0;

	Tree = new TNode[numNode + 1]; // +1 keeps the allocation non-empty for n == 0
	for (int i = 0; i < numNode; i++)
	{
		Tree[i].pre = 0;
		Tree[i].l = 0;
		Tree[i].r = 0;
		Tree[i].weight = 0;
		Tree[i].data = '\0';
		Tree[i].index = i;
	}

	// The queue never holds more than numLeaf nodes at once.
	pQueue<TNode> Q(numLeaf + 1);
	for (int i = 0; i < numLeaf; i++)
	{
		Tree[i].weight = W[i];
		Tree[i].data = data[i];
		Q.Insert(Tree[i]);                     // queue the leaf
		M.insert(pair<char, int>(data[i], i)); // remember which code slot the symbol uses
	}

	// Repeatedly merge the two lightest subtrees under a new internal node.
	for (int i = numLeaf; i < numNode; i++)
	{
		TNode a = Q.MAX(); // lightest (MAX pops the minimum)
		TNode b = Q.MAX(); // second lightest
		int pa = a.index;
		int pb = b.index;
		Tree[pa].pre = i;
		Tree[pb].pre = i;
		Tree[i].l = pa;
		Tree[i].r = pb;
		Tree[i].weight = Tree[pa].weight + Tree[pb].weight;
		Q.Insert(Tree[i]);
	}
}
void Huffman::HuffmanCode()
{
	//在建好树的基础上编码
	int c;
	int p;
	int start;
	char* t = new char[numLeaf];
	t[numLeaf - 1] = '\0';
	for (int i = 0; i < numLeaf; i++)
	{
		start = numLeaf-1;
		c = i;
		p = Tree[c].pre;
		while (p > 0)
		{
			start--;
			if (Tree[p].l == c)
			{
				t[start] = '0';
			}
			else
				t[start] = '1';
			c = p;
			p = Tree[c].pre;
		}
		char* s = new char[numLeaf - start];
		S.push_back(s);
		strcpy_s(S[i], (numLeaf - start),t+start);
	}
}
// Releases every code string and the node array.
//
// The loop is bounded by the number of codes actually stored rather than by
// numLeaf: if the tree was built but HuffmanCode() never ran, S is empty and
// indexing it up to numLeaf would read out of bounds.
Huffman::~Huffman()
{
	for (vector<char*>::size_type k = 0; k < S.size(); k++)
	{
		delete[] S[k];
		S[k] = nullptr;
	}
	delete[] Tree;
}
// Returns the y-th character of the code stored for symbol position x.
// No bounds checking is performed on either index.
char Huffman::visS(int x, int y)
{
	const char* code = S[x];
	return code[y];
}
// Returns the code-table position recorded for symbol c.
// Uses map::operator[], so an unknown symbol is inserted with position 0.
int Huffman::visM(char c)
{
	const int position = M[c];
	return position;
}
int Huffman::visTreeindex(int i)
{
	return Tree[i].index;
}
char Huffman::visTreechar(int i)
{
	return Tree[i].data;
}
int Huffman::visTreeLindex(int i)
{
	return Tree[i].l;
}
int Huffman::visTreeRindex(int i)
{
	return Tree[i].r;
}


三.压缩:

将文本编码为二进制位流;若总位数不是8的倍数,则在末尾补0凑满一个字节。压缩文件中依次写入:字符种类数、各字符及其频率、补0的个数、编码后的字节数,以及编码得到的二进制数据。
读取文本->计算频率和种类->建立霍夫曼树->向文件写入不同种类字符及其频率->编码为二进制,写入。
#include"Huffman_.h"
#include<vector>
#include<fstream>
#include<iostream>
#include<set>
using namespace std;
/// Compresses one file with Huffman coding.
/// The two file-name pointers are stored as given and are not owned.
class ZIP
{
public:
	ZIP(char* s1, char* s2); // s1: file to compress, s2: file to create
	~ZIP();
	void Read_Txt();  // load the whole source file into DATA
	void Count();     // fill CHAR / weight with the distinct bytes and their frequencies
	void ComInit();   // build the Huffman tree and the code table
	void Compress();  // full pipeline: read -> count -> build tree -> write header -> write packed bits
private:
	Huffman H;
	vector<int> weight;      // frequency of each distinct byte, parallel to CHAR
	vector<char> CHAR;       // distinct bytes in order of first appearance
	vector<char> DATA;       // every byte of the source file, in order
	char* filename;          // path of the source file
	char* write_filename;    // path of the compressed file
	int numChar;             // number of distinct bytes
	long long int ALLCHAR;   // total number of bytes read
};

函数定义:

#include"ZIP_.h"
// Remembers the source path (s1) and destination path (s2) and starts with
// zeroed counters. The strings are not copied.
ZIP::ZIP(char* s1 ,char* s2)
	: H(), filename(s1), write_filename(s2), numChar(0), ALLCHAR(0)
{
}
// Builds the Huffman tree from the counted symbols, then derives each
// symbol's code.
void ZIP::ComInit()
{
	const int kinds = numChar;
	H.Bulid(weight, CHAR, kinds); // construct the tree
	H.HuffmanCode();              // assign a code to every distinct symbol
}
// Reads the source file in binary mode, appending every byte to DATA and
// counting it in ALLCHAR.
void ZIP::Read_Txt()
{
	fstream os;
	os.open(filename, ios::in|ios::binary);
	// peek() reports EOF at end of file, so the loop stops before a failed get().
	for (char ch; os.peek() != EOF; ALLCHAR++)
	{
		os.get(ch);
		DATA.push_back(ch);
	}
	os.close();
}
void ZIP::Count()
{
	set<char> M;//查重用
	for (int i = 0; i < DATA.size(); i++)
	{
		if (M.find(DATA[i]) == M.end())//字符DATA[i]第一次出现
		{
			CHAR.push_back(DATA[i]);//压入字符种类集
			weight.push_back(count(DATA.begin(), DATA.end(), DATA[i]));//计算频率
			numChar++;//字符种类数+1
			M.insert(DATA[i]);//压入映射,便于查重
		}
	}
}
void ZIP::Compress()//压缩
{
	Read_Txt();//读取文本
	Count();//统计字符
	ComInit();
	ofstream fout(write_filename, ios::out|ios::binary);
	fout.write((char*)&numChar, sizeof(int));
	for (int i = 0; i < numChar; i++)
	{
		fout.write((char*)&CHAR[i], sizeof(unsigned char));
		fout.write((char*)&weight[i], sizeof(int));
	}
	char* ANS = new char[100000000];//存储二进制文本
	memset(ANS, 0, sizeof(char) * 10000000);
	int now = 7;
	long long int pos = 0;//压缩后八位二进制编码的个数
	for (long long int i = 0; i < ALLCHAR; i++)
	{
		int k = 0;
		while (H.visS(H.visM(DATA[i]), k) != '\0')
		{
			ANS[pos] |= ((int((H.visS(H.visM(DATA[i]), k) - '0'))) << (now));//从高位到地位生成二进制数并写入ANS
			now--;
			if (now < 0)
			{
				now = 7;
				pos++;
			}
			k++;
		}
	}
	int zero = 0;
	if (now != 7)//二进制位数不为八的倍数,补0
	{
		zero = now % 7 + 1;
		pos++;
	}
	fout.write((char*)&zero, sizeof(int));
	fout.write((char*)&pos, sizeof(long long int));
	ANS[pos] = '\0';
	fout.write(ANS, pos);
	delete[] ANS;
}
// Nothing to release here: the file-name pointers are not owned and the
// members clean up after themselves.
ZIP::~ZIP() {}

四.解压:

/// Restores a file produced by ZIP.
/// The two file-name pointers are stored as given and are not owned; the
/// buffer `str` is released in the destructor.
class DeZIP
{
public:
	DeZIP(char* s1,char* s2); // s1: compressed file, s2: file to create
	~DeZIP();
	void Read_Txt();    // read the header, rebuild the Huffman tree, load the packed data
	void DeCompress();  // decode the packed data by walking the tree and write the result
private:
	Huffman H;
	vector<int> weight;    // frequency of each symbol, parallel to CHAR
	vector<char> CHAR;     // the symbols, in the order stored in the header
	char* filename;        // path of the compressed file
	char* write_filename;  // path of the restored file
	long long int size;    // number of packed data bytes
	char* str;             // the packed data bytes
	int numChar;           // number of distinct symbols
	int zero;              // number of padding bits in the last data byte
};

函数定义:
#include"ZIP_.h"
void DeZIP::Read_Txt()//读取文本
{
	ifstream fin(filename, ios::in | ios::binary);//打开文件
	fin.read((char*)&numChar, sizeof(int));
	char c;
	int w;
	for (int i = 0; i < numChar; i++)
	{
		fin.read((char*)&c, sizeof(unsigned char));
		fin.read((char*)&w, sizeof(int));
		CHAR.push_back(c);
		weight.push_back(w);
	}
	fin.read((char*)&zero, sizeof(int));
	fin.read((char*)&size, sizeof(long long int));
	H.Bulid(weight, CHAR, numChar);//建树
	H.HuffmanCode();//对不同字符编码
	str = new char[size + 100];
	fin.read(str, size);
}
void DeZIP::DeCompress()//解压
{
	Read_Txt();//读取被压缩的文本
	long long end = 0;//每一个八位二进制读取从哪里结束
	int root = 2*numChar-2;
	ofstream out(write_filename, ios::out || ios::binary);
	for (int i = 0; i < size; i++)
	{
		char v = str[i];//一个二进制数对应的字符
		if (i == size - 1)//被补0的二进制,特殊设置end
			end = zero;
		for (int j = 7; j >= end; j--)//从高位开始读取二进制数
		{
			if ((1 << j)&v)//1对应右子树
			{
				root = H.visTreeRindex(root);
			}
			else//0对应左子树
			{
				root = H.visTreeLindex(root);
			}
			if (H.visTreeRindex(root) == 0 && H.visTreeLindex(root) == 0)//1.到达叶节点 2.开始解码下一个字符(可能还在同一个二进制数中)
			{
				char c = H.visTreechar(root);
				out.write(&c, sizeof(unsigned char));
				root = 2 * numChar - 2;;//在树上自顶向下解码
			}
		}
	}
	out.close();
}
// Remembers the compressed-file path (s1) and the output path (s2); the
// strings are not copied.
//
// Every other member is given a defined value. In particular `str` must be
// null here because the destructor runs `delete[] str` even when no file was
// ever read; it was previously left uninitialized.
DeZIP::DeZIP(char* s1,char* s2)
	: H(), filename(s1), write_filename(s2), size(0), str(nullptr), numChar(0), zero(0)
{
}
// Frees the buffer that holds the packed data.
DeZIP::~DeZIP()
{
	delete[] str; // allocated with new[] when the compressed file was read
}




测试一下:

// Interactive driver: 1 compresses a file, 2 decompresses one, anything else
// (including end of input) leaves the loop.
//
// The file names are read into fixed 100-byte arrays with the stream width
// set, so an over-long name is cut off instead of overflowing the buffer.
// Stack arrays also remove the new[]/delete[] pair from every branch.
while (1)
{
	cout << "压缩文件请按1,解压文件请按2,退出请按3 ";
	int press = 0;
	cin >> press; // a failed read leaves 0, which exits below
	if (press != 1 && press != 2)
		break;

	char s1[100] = { 0 };
	char s2[100] = { 0 };

	cout << "请输入文件路径或程序文件下文件名: ";
	cin.width(sizeof(s1)); // read at most 99 characters plus the terminator
	cin >> s1;

	if (press == 1)
		cout << "请输入压缩后文件名: ";
	else
		cout << "请输入解压后文件名: ";
	cin.width(sizeof(s2));
	cin >> s2;

	if (!cin)
		break; // input ended before both names were read

	if (press == 1)
	{
		ZIP Z(s1, s2);
		Z.Compress();
	}
	else
	{
		DeZIP DZ(s1, s2);
		DZ.DeCompress();
	}
}


1.31 MB (1,376,256 字节)
压缩
728 KB (745,472 字节)
压缩率:1.846


猜你喜欢

转载自blog.csdn.net/qq_40510553/article/details/80186294