基于哈夫曼树的文件压缩

基本思想：
压缩：
1、统计出文件中相同字符出现的次数
2、获取哈夫曼编码
次数作为权值构建哈夫曼树
3、重新编码，写回压缩文件
保存头文件：
源文件后缀
编码信息的行数
每个字符的权
保存编码

解压缩：
1、获取原文件后缀
2、获取每个字符出现的次数，即权值
3、利用之前获得的权值，还原哈夫曼树
4、找到对应的叶子节点，将信息保存到解压文件中

在写压缩文件之前，首先需要实现堆和哈夫曼树

1，建堆

#include<cassert>
#include<iostream>
#include<utility>
#include<vector>
using namespace std;
//利用仿函数的特性实现代码的复用性
/// Comparison functor: returns true when `l` is strictly less than `r`.
///
/// Heap moves an element above its parent whenever the comparator returns
/// true for (element, parent), so using Small as the heap's Compare keeps the
/// smallest element on top.
///
/// The call operator is const so the functor can also be invoked through a
/// const object or const reference.
template<class T>
struct Small
{
    bool operator()(const T& l, const T& r) const
    {
        return l < r;
    }
};

/// Comparison functor: returns true when `l` is strictly greater than `r`.
///
/// Heap moves an element above its parent whenever the comparator returns
/// true for (element, parent), so using Large as the heap's Compare keeps the
/// largest element on top.
///
/// The call operator is const so the functor can also be invoked through a
/// const object or const reference.
template<class T>
struct Large
{
    bool operator()(const T& l, const T& r) const
    {
        return l > r;
    }
};

/// Binary heap stored in a std::vector.
///
/// `Compare` decides which element rises to the top: an element is moved above
/// its parent whenever `Compare()(element, parent)` is true. With the default
/// `Large<T>` (operator>) the largest element ends up on top, i.e. a max-heap;
/// pass a "less than" functor such as `Small<T>` to get a min-heap.
template<class T, class Compare = Large<T>>
class Heap
{
public:
    /// Creates an empty heap.
    Heap()
    {}

    /// Builds a heap from the first `size` elements of `a`.
    /// `a` must not be null; a non-positive `size` yields an empty heap.
    Heap(const T *a, int size)
    {
        assert(a);
        if (size <= 0)
            return;
        _a.assign(a, a + size);
        // Sift down every non-leaf node, starting from the last one.
        for (int j = (size - 2) / 2; j >= 0; --j)
        {
            adjust_down(j);
        }
    }

    /// Inserts `x` and restores the heap property.
    void Push(const T& x)
    {
        _a.push_back(x);
        adjust_up(static_cast<int>(_a.size() - 1));
    }

    /// Removes the top element. The heap must not be empty.
    void Pop()
    {
        assert(!_a.empty());
        using std::swap;
        // Move the top to the back, drop it, then repair from the root.
        swap(_a.front(), _a.back());
        _a.pop_back();
        adjust_down(0);
    }

    /// Number of stored elements.
    std::size_t Size() const
    {
        return _a.size();
    }

    /// True when the heap holds no elements.
    bool Empty() const
    {
        return _a.empty();
    }

    /// Returns the top element. The heap must not be empty.
    const T& Top() const
    {
        assert(!_a.empty());
        return _a[0];
    }

    /// Prints the underlying array (in storage order) to std::cout,
    /// each element followed by a space, then a newline.
    void Display() const
    {
        for (std::size_t i = 0; i < _a.size(); ++i)
        {
            std::cout << _a[i] << " ";
        }
        std::cout << std::endl;
    }

    /// Sifts the element at index `root` down until neither child should be
    /// above it. Indices outside the array are ignored.
    void adjust_down(int root)
    {
        if (root < 0)
            return;
        Compare com;
        const std::size_t count = _a.size();
        std::size_t parent = static_cast<std::size_t>(root);
        std::size_t child = parent * 2 + 1;      // left child of parent
        while (child < count)
        {
            // The left child is known to exist here; the right child may not,
            // so bounds-check before comparing the two siblings.
            if (child + 1 < count && com(_a[child + 1], _a[child]))
            {
                ++child;
            }
            if (!com(_a[child], _a[parent]))
                break;                           // heap property already holds
            using std::swap;
            swap(_a[child], _a[parent]);
            parent = child;                      // continue from the child
            child = parent * 2 + 1;
        }
    }

    /// Sifts the element at index `child` up until its parent should stay
    /// above it. Indices outside the array are ignored.
    void adjust_up(int child)
    {
        if (child <= 0 || static_cast<std::size_t>(child) >= _a.size())
            return;
        Compare com;
        std::size_t cur = static_cast<std::size_t>(child);
        while (cur > 0)
        {
            const std::size_t parent = (cur - 1) / 2;
            // Stop as soon as the parent is correctly ordered; otherwise keep
            // climbing (the loop must not exit after a single swap).
            if (!com(_a[cur], _a[parent]))
                break;
            using std::swap;
            swap(_a[cur], _a[parent]);
            cur = parent;
        }
    }

protected:
    std::vector<T> _a;   // heap storage; children of index i are 2i+1 and 2i+2
};

2，构建哈夫曼树

#include "heap.h"

/// A single node of a Huffman tree: a weight plus links to its two children
/// and to its parent. A freshly constructed node has no links.
template<class T>
struct HuffmanTreeNode
{
    T _weight;                              // weight carried by this node
    HuffmanTreeNode<T> *_left = nullptr;    // left child, null if none
    HuffmanTreeNode<T> *_right = nullptr;   // right child, null if none
    HuffmanTreeNode<T> *_parent = nullptr;  // parent, null if none

    /// Creates an unlinked node with weight `w` (value-initialized by default).
    HuffmanTreeNode(const T& w = T())
        : _weight(w)
    {}
};

template<class T>
class HuffmanTree
{
    typedef HuffmanTreeNode<T> Node;
public:
    HuffmanTree()
        :_root(NULL)
    {}
    HuffmanTree(const T* a, size_t size)
        :_root(NULL)
    {
        //定义一个内部类
        struct NodeLess
        {
            bool operator()(Node *l, Node *r)const
            {
                return l->_weight < r->_weight;
            }
        };
        Heap<Node *, NodeLess> minHeap;
        //建立结点并放入vector中
        for (size_t i = 0; i<size; ++i)
        {
            Node *tmp = new Node(a[i]);
            minHeap.Push(tmp);
        }
        //取出较小的两个结点作为左右孩子并构建父结点
        while (minHeap.Size() > 1)
        {
            Node *left = minHeap.Top();
            minHeap.Pop();
            Node *right = minHeap.Top();
            minHeap.Pop();
            Node *parent = new Node(left->_weight + right->_weight);
            parent->_left = left;
            parent->_right = right;
            left-> = p_parentarent;
            right->_parent = parent;
            minHeap.Push(parent);
        }
        _root = minHeap.Top();
    }

    HuffmanTree(const T* a, size_t size, const T& invalid)
    {

        struct NodeLess
        {
            bool operator()(Node *l, Node *r)const
            {
                return l->_weight < r->_weight;
            }
        };
        Heap<Node *, NodeLess> minHeap;
        //建立结点并放入vector中
        for (size_t i = 0; i<size; ++i)
        {
            if (a[i] != invalid)
            {
                Node *tmp = new Node(a[i]);
                minHeap.Push(tmp);
            }
        }
        //取出较小的两个结点作为左右孩子并构建父结点
        while (minHeap.Size() > 1)
        {
            Node *left = minHeap.Top();
            minHeap.Pop();
            Node *right = minHeap.Top();
            minHeap.Pop();
            Node *parent = new Node(left->_weight + right->_weight);
            parent->_left = left;
            parent->_right = right;
            left->_parent = parent;
            right->_parent = parent;
            minHeap.Push(parent);
        }
        _root = minHeap.Top();
    }
    Node* GetRoot()
    {
        return _root;
    }

    void Destroy(Node* &root)
    {
        if (root == NULL)
            return;
        Destroy(root->_left);
        Destroy(root->_rihgt);
        delete root;
        root = NULL:
        return;
    }
protected:
    Node *_root;
};

3，生成哈夫曼编码，并进行压缩和解压缩

#include<string>
#include<Windows.h>
#include<assert.h>

#include "huffman_tree.h"
using namespace std;


typedef long long Type;

/// Bookkeeping for one byte value: the byte itself, how often it occurs, and
/// its Huffman code written as a string of '0'/'1' characters.
///
/// All operators look at `_count` only, which is what lets CharInfo serve as
/// the weight type of a Huffman tree.
struct CharInfo
{
    unsigned char _ch;     // the byte value
    Type _count;           // number of occurrences
    std::string _code;     // Huffman code as '0'/'1' characters

    /// Creates an entry with the given count, byte value 0 and an empty code.
    CharInfo(Type count = 0)
        : _ch(0)
        , _count(count)
        , _code()
    {}

    /// Sum of the two counts; the result has byte value 0 and an empty code.
    CharInfo operator + (const CharInfo& fc) const
    {
        return CharInfo(_count + fc._count);
    }

    /// True when the counts differ. Takes its argument by const reference so
    /// a comparison does not copy the code string.
    bool operator != (const CharInfo& fc) const
    {
        return _count != fc._count;
    }

    /// Orders entries by count.
    bool operator < (const CharInfo& fc) const
    {
        return _count < fc._count;
    }
};

class FileCompress
{
protected:
    CharInfo _infos[256];
public:
    //默认的构造函数
    FileCompress()
    {
        for (size_t i = 0; i<256; ++i)
        {
            _infos[i]._ch = i;
        }
    }

    //生成Huffman_code函数
    void GenerateHufffmanCode(HuffmanTreeNode<CharInfo> * root, string code)
    {
        if (root == NULL)return;
        if (root->_left == NULL&&root->_right == NULL)//叶子节点
        {
            _infos[root->_weight._ch]._code = code;
            return;
        }
        GenerateHufffmanCode(root->_left, code + '0');
        GenerateHufffmanCode(root->_right, code + '1');
    }
    string Compress(const char *filename)
    {
        assert(filename);
        FILE *pf = fopen(filename, "rb");
        assert(pf);
        //fgetc函数的作用是意为从文件指针stream指向的文件中读取一个字符，读取一个字节后，光标位置后移一个字节，返回值为他所读到的字符，因为返回值要能表示-1，所以返回值类型是int
        unsigned char ch = fgetc(pf);
        //统计字符出现的次数
        while (!feof(pf))//feof检测文件流上的结束标志
        {
            _infos[ch]._count++;
            ch = fgetc(pf);
        }
        //以该字符出现的次数构建一颗HuffmanTree.
        CharInfo invalid;   //非法值
        HuffmanTree<CharInfo> ht(_infos, 256, invalid);
        //生成Huffman编码
        string code;
        GenerateHufffmanCode(ht.GetRoot(), code);
        //压缩文件
        fseek(pf, 0, SEEK_SET);          //回到文件头
        string compressfile = filename;
        compressfile += ".compress";   //压缩后的文件名
        FILE *fin = fopen(compressfile.c_str(), "wb");
        assert(fin);
        size_t pos = 0;                  //记录位数
        unsigned char value = 0;
        ch = fgetc(pf);
        while (!feof(pf))
        {
            string &code = _infos[ch]._code;
            for (size_t i = 0; i<code.size(); ++i)
            {
                value <<= 1;
                if (code[i] == '1')
                    value |= 1;
                else
                    value |= 0;    //do-nothing
                ++pos;
                if (pos == 8)     //满一个字节
                {
                    fputc(value, fin);
                    value = 0;
                    pos = 0;
                }
            }
            ch = fgetc(pf);
        }
        if (pos)      //解决不足8位的情况.
        {
            value <<= (8 - pos);
            fputc(value, fin);
        }
        //配置文件--便于重建Huffman树
        string configfilename = filename;
        configfilename += ".config";
        FILE *finconfig = fopen(configfilename.c_str(), "wb");
        assert(finconfig);
        string line;
        char buff[128];
        for (size_t i = 0; i<256; ++i)
        {
            //一行一行的读
            if (_infos[i]._count)
            {
                line += _infos[i]._ch;
                line += ",";
                line += _itoa(_infos[i]._count, buff, 10);
                line += "\n";
                //fputs(line.c_str(),finconfig);
                fwrite(line.c_str(), 1, line.size(), finconfig);
                line.clear();
            }
        }
        fclose(pf);
        fclose(fin);
        fclose(finconfig);
        return compressfile;
    }
    string UnCompress(const char *filename)
    {
        assert(filename);
        string configfilename = filename;
        size_t index = configfilename.rfind(".");
        configfilename = configfilename.substr(0, index);
        configfilename += ".config";
        FILE *foutconfig = fopen(configfilename.c_str(), "rb");
        assert(foutconfig);
        string line;
        //读取配置文件--获取字符出现的次数
        unsigned char ch = 0;
        while (ReadLine(foutconfig, line))
        {
            if (line.empty())
            {
                line += '\n';
                continue;
            }
            //读到空行
            ch = line[0];
            _infos[ch]._count = atoi(line.substr(2).c_str());
            line.clear();
        }
        //构建Huffman树
        CharInfo invalid;
        HuffmanTree<CharInfo> hft(_infos, 256, invalid);
        //根结点的权值也就是字符出现的次数总和
        HuffmanTreeNode<CharInfo> *root = hft.GetRoot();
        Type charcount = root->_weight._count;
        //解压缩
        string uncompressfilename = filename;
        index = uncompressfilename.rfind(".");
        uncompressfilename = uncompressfilename.substr(0, index);
        uncompressfilename += ".uncompress";
        FILE *fin = fopen(uncompressfilename.c_str(), "wb");
        assert(fin);
        //由压缩文件还原文件
        string compressfilename = filename;
        FILE *fout = fopen(compressfilename.c_str(), "rb");
        assert(fout);

        HuffmanTreeNode<CharInfo> *cur = root;
        int pos = 7;
        ch = fgetc(fout);
        while (charcount > 0)
        {
            while (cur)
            {
                if (cur->_left == NULL && cur->_right == NULL)
                {
                    //叶子结点
                    fputc(cur->_weight._ch, fin);
                    cur = root;
                    --charcount;
                    if (charcount == 0)   //所有的字符都处理完成
                        break;
                }
                if (ch & (1 << pos))    //检查字符的每个位
                    cur = cur->_right;    //1向右走
                else
                    cur = cur->_left;     //0向左走
                --pos;
                if (pos < 0)             //一个字节解压完成
                {
                    ch = fgetc(fout);
                    pos = 7;
                }
            }
        }
        fclose(foutconfig);
        fclose(fin);
        fclose(fout);
        return uncompressfilename;
    }
    //读取一行字符并放在line中
    bool ReadLine(FILE *fout, string& line)
    {
        int ch = fgetc(fout);
        if (ch == EOF)
            return false;
        while (ch != EOF && ch != '\n')
        {
            line += ch;
            ch = fgetc(fout);
        }
        return true;
    }
};

4，测试

#include"huffman_code.h"



void testFileCompress()
{
    FileCompress fc;
    fc.Compress("1.png");
    fc.UnCompress("1.png.compress");
}

// Entry point: runs the compression round trip, then hands "pause" to the
// command interpreter before exiting.
int main()
{
    testFileCompress();

    system("pause");
    return 0;
}

基于哈夫曼树的文件压缩

猜你喜欢