哈夫曼编码原理解析及算法构造过程

一.哈夫曼编码

哈夫曼编码（Huffman Coding），又称霍夫曼编码，是可变字长编码（VLC）的一种。Huffman 于 1952 年提出该编码方法：它完全依据字符出现的概率来构造异字头（任一码字都不是其他码字的前缀）且平均长度最短的码字，因此有时也称之为最佳编码，一般就叫做 Huffman 编码。

二.构造原理

首先要构造一棵二叉树（哈夫曼树）。只有二叉树的叶子节点才用来存储数据；译码时具体取出哪个数据，由给出的编码（即从根节点到叶子节点的路径）确定。

每一组数据对应一棵哈夫曼树，不同文件的哈夫曼树一般是不一样的。

三.构造方法

构造哈夫曼树

每一次取出出现概率（权值）最小的两个节点作为新节点的子节点（小左大右），新节点的权值即左右子节点权值之和，然后将新节点放回原数据集合中；重复上述过程，直到集合中只剩下一个节点，该节点即哈夫曼树的根。

译码

从建立好的哈夫曼树的根节点开始，按编码逐位向下查找（0 走左孩子，1 走右孩子）；到达叶子节点即得到一个数据，然后回到根节点重新开始查找。

（不能用其他的树的编码来完成译码，容易出现内存访问错误）

四.具体算法实现（C++）

首先确定数据符号有哪些以及每一种符号权值是多少（此处不做解释）

本算法以如下数据构建哈夫曼树：

char c[8]={'a','b','c','d','e','f','g','h'};
// Symbol set: c[i] is the symbol whose weight is w[i].
int w[8]={5,29,7,8,14,23,3,11};
// Weights, index-aligned with c.

// Sorts w into ascending order and permutes c in lockstep, so that c[i] stays
// paired with w[i]. Bubble sort with a strict '>' comparison: entries of equal
// weight keep their relative order.
void sort(){
	// Derive the element count from the array instead of repeating a literal.
	const int n=(int)(sizeof(w)/sizeof(w[0]));
	for(int pass=0;pass<n-1;pass++){
		int swapped=0;
		// After each pass the largest remaining weight has settled at the end,
		// so the already-sorted tail does not need to be scanned again.
		for(int j=0;j<n-1-pass;j++){
			if(w[j]>w[j+1]){
				int tw=w[j];
				char tc=c[j];
				w[j]=w[j+1];
				c[j]=c[j+1];
				w[j+1]=tw;
				c[j+1]=tc;
				swapped=1;
			}
		}
		// A pass without swaps means the arrays are already ordered.
		if(!swapped)break;
	}
}

由于用到二叉树，基础节点定义如下：

// Node of the Huffman tree. find() treats a node whose two children are both
// NULL as a leaf.
struct ht{
	char data;// symbol stored in the node; creat() writes '#' into the internal nodes it builds
	int weight;// weight of the node; creat() sets it to the sum of the two children's weights
	ht *lc;// left child
	ht *rc;// right child
};

如果每次合并后都对数据重新排序，时间复杂度较高；因此先对数组排序一次，再据此建立对应的有序链表，之后每建立一个新的根节点，只需按权值将其插入链表中合适的位置即可。

// Entry of the doubly linked list that holds the tree nodes still waiting to
// be merged.
struct l{
	ht *data;// tree node carried by this entry
	l *last;// previous list entry
	l *next;// next list entry; NULL marks the end of the list
}; // doubly linked list structure

ht *h;// global: root of the Huffman tree; set in main() after creat(), used by find() to restart decoding
// Inserts tree node p into the weight-ordered doubly linked list starting at e.
//
// p: tree node to insert; a new list entry is malloc'ed to carry it.
// e: first entry of the list. It is passed by value, so the caller's pointer
//    is not updated if the new entry ends up in front of it (the new entry is
//    then reachable through e->last).
//
// The new entry is placed in front of the first entry whose weight is strictly
// greater than p->weight, or appended at the end when there is no such entry,
// so entries of equal weight keep their insertion order.
// Nothing happens when p or e is NULL.
// NOTE(review): the malloc result is not checked; there is no error channel in
// this void interface.
void insert(ht *p,l *e){
	if(p==NULL||e==NULL)return;
	// Walk to the first entry heavier than p, remembering the entry before it.
	l *pos=e;
	l *tail=e;
	while(pos!=NULL&&!(p->weight<pos->data->weight)){
		tail=pos;
		pos=pos->next;
	}
	l *n=(l *)malloc(sizeof(l));
	n->data=p;
	if(pos==NULL){
		// No heavier entry exists: append behind the last entry.
		n->last=tail;
		n->next=NULL;
		tail->next=n;
		return;
	}
	// Splice the new entry in front of pos.
	n->last=pos->last;
	n->next=pos;
	if(pos->last!=NULL){
		// pos has no predecessor when it is the first entry; there is then
		// no forward link to redirect.
		pos->last->next=n;
	}
	pos->last=n;
}
// Builds the Huffman tree from the weight-ordered list starting at head.
//
// head: reference to the first list entry. On return it refers to the single
//       remaining entry, whose data member is the root of the tree.
//
// Each round merges the two front entries under a new internal node labelled
// '#' (first entry as left child, second as right child) whose weight is the
// sum of theirs, inserts that node back into the list by weight, and releases
// the two merged list entries (the tree nodes they carried live on as children).
// Assumes the list is sorted ascending and all weights are non-negative, so
// the merged node is never placed in front of the two entries it replaces.
// A NULL or single-entry list is left untouched.
// NOTE(review): the malloc result is not checked; there is no error channel in
// this void interface.
void creat(l *&head){
	while(head!=NULL&&head->next!=NULL){
		l *first=head;
		l *second=first->next;
		ht *parent=(ht *)malloc(sizeof(ht));
		parent->data='#';
		parent->weight=first->data->weight+second->data->weight;
		parent->lc=first->data;
		parent->rc=second->data;
		// Insert before advancing: when only two entries are left, the merged
		// node is appended behind them and becomes the new (final) head.
		insert(parent,head);
		head=second->next;
		if(head!=NULL){
			// The new front entry must not keep pointing at a released entry.
			head->last=NULL;
		}
		free(first);
		free(second);
	}
}

设置读取方式

// Decodes the bit string s against the Huffman tree and writes every decoded
// symbol to cout (no separator, no trailing newline).
//
// t: node to start walking from; callers pass the root.
// s: NUL-terminated string of '0' and '1' characters.
//
// '0' descends to the left child, '1' to the right child. Reaching a leaf
// prints its symbol and restarts at the global root h. Decoding stops at the
// first character that is neither '0' nor '1' (including the terminating NUL);
// bits of an unfinished code at the end of the input produce no output.
// Does nothing when t, s or h is NULL.
void find(ht *t,char *s){
	if(t==NULL||s==NULL||h==NULL)return;
	ht *cur=t;
	for(;;){
		if(cur->lc==NULL&&cur->rc==NULL){
			// Leaf: one symbol decoded.
			cout<<cur->data;
			if(h->lc==NULL&&h->rc==NULL){
				// The root itself is a leaf, so no bit would ever be consumed;
				// stop instead of printing forever.
				return;
			}
			cur=h;
			continue;
		}
		if(*s=='0'){
			cur=cur->lc;
		}
		else if(*s=='1'){
			cur=cur->rc;
		}
		else{
			// Any other character ends decoding.
			return;
		}
		++s;
		if(cur==NULL)return;// malformed tree: internal node with a missing child
	}
}

检验

//测试数据如下：000100011011101110
int main(){
	sort();
	l *head=NULL;
	head=(l *)malloc(sizeof(l));
	l *temp=head;
	ht *p=(ht *)malloc(sizeof(ht));
	p->data=c[0];
	p->weight=w[0];
	p->lc=NULL;
	p->rc=NULL;
	temp->data=p;
	temp->last=NULL;
	temp->next=NULL;
	temp=temp->next;
	l *r=head;
	for(int i=1;i<8;i++){
		p=(ht *)malloc(sizeof(ht));
		temp=(l *)malloc(sizeof(l));
		temp->last=r;
		r->next=temp;
		temp->next=NULL;
		p->data=c[i];
		p->weight=w[i];
		p->lc=NULL;
		p->rc=NULL;
		temp->data=p;
		temp=temp->next;
		r=r->next;
	}
	creat(head);
	p=head->data;
	h=p;
	char s[10000];
	cin>>s;
	find(h,s);
	return 0;
}