利用哈夫曼编码压缩文本

文章目录

使用哈夫曼编码进行压缩文本

文本内容
读取文件内容至内存中
遍历文本内容，获取每个字符对应出现概率值
建立哈夫曼树
获取哈夫曼编码
将转换后的编码写入新文件
检测压缩率
利用编码文件进行还原文本

完整code

使用哈夫曼编码进行压缩文本

文本内容

Even though neural network-based models have achieved
a lot, there are several important things that have not been
taken into consideration. Firstly, as shown in Figure 1, each
block is represented as a low-dimensional embedding with
manually selected features, which will cause the loss of
much semantic information. Secondly, the order of the nodes
plays an important role in representing binary functions,
while previous approaches did not design methods to extract
it. To solve these two problems, we propose an overall
framework with three components: semantic-aware modeling,
structural-aware modeling, and order-aware modeling.

读取文件内容至内存中

char *read_file(const char *filename)     ///从文件中读取字符串
{
    FILE *fp = NULL;
    char buf[256];
    int len = 0;
    char *x = new char [10000];
    *x = '\0';

    if((fp = fopen(filename, "r"))==NULL)
    {
        perror("can't open the file");
        exit(1);
    }
    while(fgets(buf, 255, fp) != NULL)
    {
        len = strlen(buf);
        //printf("%d", len);
        if(buf[len-1] == '\n')
        {
            buf[len-1] = '\0';
        }
        //printf("%s\n", buf);
        strcat(x, buf);
    }
    //printf("%s\n", x);
    fclose(fp);
    return x;
}

遍历文本内容，获取每个字符对应出现概率值

int samechar(char *x)     ///该函数将不同字符出现的次数记录下来
{
    int flag;
    int n=0;
    tree_huffman[n].c = x[0];
    tree_huffman[n].w = 1.0;
    int l=strlen(x);
    for(int i=1;i<l;i++)
    {
        int j;
        flag=0;
        for(j=0;j<=n;j++)
        {
            if(x[i]==tree_huffman[j].c)
            {
                flag=1;
                tree_huffman[j].w++;
                break;
            }
        }
        if(!flag)
        {
            n++;
            tree_huffman[n].c=x[i];
            tree_huffman[n].w=1.0;
        }
    }
    return n+1;
}

建立哈夫曼树

首先给所有节点的权值由小到大排序；
取出最小的两个节点，将它们的权值相加，得到一个新节点
使用这个新节点代替两个原来的节点
继续排序，替换
直到整个队列中只有一个节点
这就构成了一棵哈夫曼树

int cmp(Tree a, Tree b)
{
    return a.w<b.w;
}

int create_huffman(int n)   ///构建哈夫曼树
{
    int rear = n, font = 0;
    Tree last;
    while(rear > font)
    {
        sort(tree_huffman+font, tree_huffman+rear, cmp);
        last = tree_huffman[font];
        last.l = font;
        font++;
        if (rear <= font)
            break;
        else
        {
            Tree t = tree_huffman[font];
            last.w += t.w;
            last.c = '\0';
            last.r = font;
            last.par = -1;
            font++;
            tree_huffman[rear++] = last;
        }
    }
    for(int i=0;i<rear;++i)
    {
        if(tree_huffman[i].l >= 0)
        {
            int lchild = tree_huffman[i].l;
            tree_huffman[lchild].par = i;
        }
        if(tree_huffman[i].r >= 0)
        {
            int rchild = tree_huffman[i].r;
            tree_huffman[rchild].par = i;
        }
    }
    //for(int i=0;i<rear;++i)
        //printf("%d, %c,%.2lf,%d,%d,%d\n", i, tree_huffman[i].c, tree_huffman[i].w, tree_huffman[i].l, tree_huffman[i].r, tree_huffman[i].par);
    return rear;

}

获取哈夫曼编码

采用递归获取哈夫曼编码
需要注意一个问题：使用char *进行递归时，系统只会给char *开辟一块内存
这样造成哈夫曼编码前缀相同
解决问题方法：换用string，string每次都开辟新内存来递归；或每次递归时给char *开辟新内存

void output_tree_code(Tree &temp, char *code)
{
    if(temp.l == -1 && temp.r == -1)
    {
        printf("%s\n", code);
        strcpy(temp.flag, code);
        return;
    }
    char t1[256], t2[256];
    strcpy(t1, code);
    strcpy(t2, code);
    output_tree_code(tree_huffman[temp.l], strcat(t1, "0"));
    output_tree_code(tree_huffman[temp.r], strcat(t2, "1"));
}

将转换后的编码写入新文件

将哈夫曼编码每八位截取，不足8位的后面补零
再将单个字节写入二进制文件

void wfile(const char *filename1, const char *filename2, int n)
{
    FILE *fp = NULL;
    fp = fopen(filename2, "wb");
    char *buf = read_file(filename1);
    char *temp = new char [10000];
    temp[0] = '\0';
    for(int i=0;i<int(strlen(buf));++i)
    {
        for(int j=0;j<n;++j)
        {
            if(buf[i] == tree_huffman[j].c)
            {
                strcat(temp, tree_huffman[j].flag);
                break;
            }
        }
    }
    printf("%s\n", temp);
    int len = 0;
    if(strlen(temp) % 8 == 0)
        len = strlen(temp) / 8;
    else
    {
        len = strlen(temp) / 8 + 1;
        while(strlen(temp) % 8 != 0)
        {
            strcat(temp, "0");
        }
    }

    for(int i=0;i<len;++i)
    {
        int a = 0;
        for(int j=0;j<8;++j)
        {
            int ca = temp[i*8+j] - '0';
            a += ca * pow(2, 7-j);
            //printf("%d\t", ca * pow(2, 8-j));
        }
        fwrite(&a, 1, 1, fp);
    }
    fclose(fp);
}

检测压缩率

查看压缩文件2.huffman的大小

查看原文件大小

342/636 = 53%，也就是说压缩率为47%，压缩了将近一半的大小，就压缩率而言，相当可观。

利用编码文件进行还原文本

读取哈夫曼编码还原文本；
从第一个字符开始读，如果为0，则向根节点的左子树走，如果为1，则向根节点的右子树走；
每读取一个字符，进行一次移动操作；
直到移动至叶子节点，该节点的左右子树均为空，递归结束，输出该节点的字符；
继续读取下一个字符；
直到所有字符读取完毕；

void unzip_huffman(const char *filename, int root)
{
    FILE *fp = NULL;
    if((fp = fopen(filename, "rb"))==NULL)
    {
        perror("can't open the file");
        exit(1);
    }
    int cb[10000];
    int num = 0;
    while(!feof(fp))
    {
        int element = getc(fp);
        //printf("%d\t", element);
        if(element != -1)
        {
            for(int i=0;i<8;++i)
            {
                int t = element % 2;
                element = element / 2;
                cb[num++] = t;
            }
        }
    }
    int ccb[10000];
    int xnum = 0;
    for(int i=0;i<num/8;++i)
    {
        for(int j=0;j<8;++j)
        {
            ccb[xnum++] = cb[i*8+7-j];
        }
    }
    fclose(fp);
    char ss[10000] = "\0";
    int sr = 0;
    num = 0;
    int rt = root;
    while(num < xnum)
    {
        while(tree_huffman[rt].l != -1 && tree_huffman[rt].r != -1)
        {
            if(ccb[num] == 0)
            {
                rt = tree_huffman[rt].l;
            }
            else if(ccb[num] == 1)
            {
                rt = tree_huffman[rt].r;
            }
            num++;
        }
        ss[sr++] = tree_huffman[rt].c;
        //printf("%c", tree_huffman[rt].c);
        rt = root;

    }
    printf("%s\n", ss);
}

完整code

#include<iostream>
#include<algorithm>
#include<cstring>
#include<string>
#include<stdio.h>
#include<stdlib.h>
#include<cmath>
using namespace std;

typedef struct haffman
{
    char c;///标记字符
    char flag[256]; ///标记哈夫曼编码
    double w;///标记权值
    int l, r;///左孩子有孩子
    int par;///标记是否存在父母节点
}Tree;

Tree tree_huffman[128]; ///ascii表共128个字符


char *read_file(const char *filename)     ///从文件中读取字符串
{
    FILE *fp = NULL;
    char buf[256];
    int len = 0;
    char *x = new char [10000];
    *x = '\0';

    if((fp = fopen(filename, "r"))==NULL)
    {
        perror("can't open the file");
        exit(1);
    }
    while(fgets(buf, 255, fp) != NULL)
    {
        len = strlen(buf);
        //printf("%d", len);
        if(buf[len-1] == '\n')
        {
            buf[len-1] = '\0';
        }
        //printf("%s\n", buf);
        strcat(x, buf);
    }
    //printf("%s\n", x);
    fclose(fp);
    return x;
}

int samechar(char *x)     ///该函数将不同字符出现的次数记录下来
{
    int flag;
    int n=0;
    tree_huffman[n].c = x[0];
    tree_huffman[n].w = 1.0;
    int l=strlen(x);
    for(int i=1;i<l;i++)
    {
        int j;
        flag=0;
        for(j=0;j<=n;j++)
        {
            if(x[i]==tree_huffman[j].c)
            {
                flag=1;
                tree_huffman[j].w++;
                break;
            }
        }
        if(!flag)
        {
            n++;
            tree_huffman[n].c=x[i];
            tree_huffman[n].w=1.0;
        }
    }
    return n+1;
}

int cmp(Tree a, Tree b)
{
    return a.w<b.w;
}

int create_huffman(int n)   ///构建哈夫曼树
{
    int rear = n, font = 0;
    Tree last;
    while(rear > font)
    {
        sort(tree_huffman+font, tree_huffman+rear, cmp);
        last = tree_huffman[font];
        last.l = font;
        font++;
        if (rear <= font)
            break;
        else
        {
            Tree t = tree_huffman[font];
            last.w += t.w;
            last.c = '\0';
            last.r = font;
            last.par = -1;
            font++;
            tree_huffman[rear++] = last;
        }
    }
    for(int i=0;i<rear;++i)
    {
        if(tree_huffman[i].l >= 0)
        {
            int lchild = tree_huffman[i].l;
            tree_huffman[lchild].par = i;
        }
        if(tree_huffman[i].r >= 0)
        {
            int rchild = tree_huffman[i].r;
            tree_huffman[rchild].par = i;
        }
    }
    //for(int i=0;i<rear;++i)
        //printf("%d, %c,%.2lf,%d,%d,%d\n", i, tree_huffman[i].c, tree_huffman[i].w, tree_huffman[i].l, tree_huffman[i].r, tree_huffman[i].par);
    return rear;

}

void output_tree_code(Tree &temp, char *code)
{
    if(temp.l == -1 && temp.r == -1)
    {
        printf("%s\n", code);
        strcpy(temp.flag, code);
        return;
    }
    char t1[256], t2[256];
    strcpy(t1, code);
    strcpy(t2, code);
    output_tree_code(tree_huffman[temp.l], strcat(t1, "0"));
    output_tree_code(tree_huffman[temp.r], strcat(t2, "1"));
}

void wfile(const char *filename1, const char *filename2, int n)
{
    FILE *fp = NULL;
    fp = fopen(filename2, "wb");
    char *buf = read_file(filename1);
    char *temp = new char [10000];
    temp[0] = '\0';
    for(int i=0;i<int(strlen(buf));++i)
    {
        for(int j=0;j<n;++j)
        {
            if(buf[i] == tree_huffman[j].c)
            {
                //printf("%s\n", tree_huffman[j].flag);
                strcat(temp, tree_huffman[j].flag);
                //fwrite(tree_huffman[j].flag, (1.0/8.0)*strlen(tree_huffman[j].flag), 1, fp);
                break;
            }
        }
    }
    printf("%s\n", temp);
    /*
    for(int i=0;i<8;++i)
    {
        int len = strlen(temp);
        if(len % 8 != 0)
        {
            strcat(temp, "0");
        }
        else
            break;
    }*/
    //printf("%s\n", temp);
    int len = 0;
    if(strlen(temp) % 8 == 0)
        len = strlen(temp) / 8;
    else
    {
        len = strlen(temp) / 8 + 1;
        while(strlen(temp) % 8 != 0)
        {
            strcat(temp, "0");
        }
    }

    for(int i=0;i<len;++i)
    {
        int a = 0;
        for(int j=0;j<8;++j)
        {
            int ca = temp[i*8+j] - '0';
            a += ca * pow(2, 7-j);
            //printf("%d\t", ca * pow(2, 8-j));
        }
        //printf("\n");
        //printf("%d\n", a);
        fwrite(&a, 1, 1, fp);
    }
    fclose(fp);
}

void unzip_huffman(const char *filename, int root)
{
    FILE *fp = NULL;
    if((fp = fopen(filename, "rb"))==NULL)
    {
        perror("can't open the file");
        exit(1);
    }
    int cb[10000];
    int num = 0;
    while(!feof(fp))
    {
        int element = getc(fp);
        //printf("%d\t", element);
        if(element != -1)
        {
            for(int i=0;i<8;++i)
            {
                int t = element % 2;
                element = element / 2;
                cb[num++] = t;
            }
        }
    }
    //printf("\n");
    //printf("cb:\n");
    //for(int i=0;i<num;++i)
        //printf("%d ", cb[i]);
    //printf("\n");
    int ccb[10000];
    int xnum = 0;
    for(int i=0;i<num/8;++i)
    {
        for(int j=0;j<8;++j)
        {
            ccb[xnum++] = cb[i*8+7-j];
        }
    }
    //for(int i=0;i<xnum;++i)
        //printf("%d ", ccb[i]);
    fclose(fp);
    char ss[10000] = "\0";
    int sr = 0;
    num = 0;
    int rt = root;
    while(num < xnum)
    {
        while(tree_huffman[rt].l != -1 && tree_huffman[rt].r != -1)
        {
            if(ccb[num] == 0)
            {
                rt = tree_huffman[rt].l;
            }
            else if(ccb[num] == 1)
            {
                rt = tree_huffman[rt].r;
            }
            num++;
        }
        ss[sr++] = tree_huffman[rt].c;
        //printf("%c", tree_huffman[rt].c);
        rt = root;

    }
    printf("%s\n", ss);
}

int main()
{

    ios::sync_with_stdio(false);
    const char *filename = "doc1.txt";
    char *st = read_file(filename);
    printf("input file's reading:\n");
    printf("%s\n", st);
    int n = samechar(st);
    printf("the char kinds of %d\n", n);
    for(int i=0;i<n;++i)
    {
        //printf("%c", tree_huffman[i].c);
        tree_huffman[i].w /= strlen(st);
        tree_huffman[i].l = -1;
        tree_huffman[i].r = -1;
        tree_huffman[i].par = -1;
        memset(tree_huffman[i].flag, 0, sizeof(tree_huffman[i].flag));
        //printf("%.2lf\n", tree_huffman[i].w);
    }
    n = create_huffman(n);
    char code[256] = "\0";
    //printf("%d", n);
    output_tree_code(tree_huffman[n-1], code);
    for(int i=0;i<n;++i)
    {
        printf("%d, %c,%.2lf,%d,%d,%d, ", i, tree_huffman[i].c, tree_huffman[i].w, tree_huffman[i].l, tree_huffman[i].r, tree_huffman[i].par);
        printf("%s\n", tree_huffman[i].flag);
        //cout<<tree_huffman[i].flag<<endl;
    }
    wfile(filename, "2.huffman", n);

    unzip_huffman("2.huffman", n-1);
    return 0;
}