这几天比较认真地研究了基于哈夫曼编码的文件压缩及解压,费了点时间,在这里分享一下:
这里用链式结构,非顺序表结构;
文件压缩:
1.获取文件信息(这里采用TXT格式文本);
2.压缩文件;
3.写配置文件(便于解压时用,无非就是存放原文件的索引之类的,比如说,文件中某个字符出现的个数,记录下来)
4.解压缩,使用压缩后的文件和配置文件解压文件;
5.用比对软件,比对解压后的文件和源文件是否相同;
下面慢慢解析:
先看一个文件信息类:
using LongType = long long;

// Per-byte statistics used while building the Huffman tree.
struct FileInfo
{
    unsigned char _ch;   // the byte value this record describes
    LongType _count;     // number of times the byte occurs
    std::string _code;   // Huffman code of the byte, as a string of '0'/'1'

    // Creates a record for `ch` with a zero count and an empty code.
    FileInfo(unsigned char ch = 0)
        : _ch(ch)
        , _count(0)
    {}

    // Returns a record whose count is the sum of both operands' counts.
    // The result's _ch is 0 and its _code is empty (it is default-constructed).
    FileInfo operator+(const FileInfo& x) const
    {
        FileInfo tmp;
        tmp._count = this->_count + x._count;
        return tmp;
    }

    // Two records differ when their counts differ; _ch and _code are ignored.
    bool operator!=(const FileInfo& x) const
    {
        return this->_count != x._count;
    }
};

// Orders records by occurrence count only.
// Operands are taken by const reference so that a comparison does not copy
// the _code string of either record.
inline bool operator<(const FileInfo& info1, const FileInfo& info2)
{
    return info1._count < info2._count;
}
以上是文件信息的类结构,包含字符、字符出现的次数,以及该字符对应的哈夫曼编码(能看到这篇博客的兄弟,对哈夫曼编码不会陌生,这里不再强调)。除了保存字符出现的次数和哈夫曼编码之外,还完成了几个运算符的重载。
要获取哈夫曼编码,就得先建立哈夫曼树;建立哈夫曼树时用最小堆来操作,以下是最小堆的实现:
// Min-heap.

// Comparison functor: true when l orders before r according to operator<.
template<class T>
struct Less
{
    bool operator()(const T& l, const T& r) const
    {
        return l < r;
    }
};

// Comparison functor: true when l orders after r according to operator>.
template<class T>
struct Greater
{
    bool operator()(const T& l, const T& r) const
    {
        return l > r;
    }
};

// Array-backed binary min-heap.
//
// The ordering test used by AdjustDown/AdjustUp is `*x < y`: the left operand
// is dereferenced, the right one is not. The Compare template parameter is
// accepted but not used by any member.
// NOTE(review): this requires an operator< that accepts (*T, T) for the
// element type -- confirm against the element type's definition before
// switching the heap over to Compare.
template<class T, class Compare = Less<T> >
class Heap
{
public:
    // Creates an empty heap.
    Heap()
    {}

    // Builds a heap from the first `size` elements of `a`.
    Heap(const T* a, size_t size)
    {
        _arrays.assign(a, a + size);

        // Sift down every node that has at least one child, last parent
        // first. Counting down from size/2 avoids the unsigned underflow of
        // (size - 2) / 2 when the heap holds fewer than two elements.
        for (size_t i = _arrays.size() / 2; i-- > 0; )
        {
            AdjustDown(static_cast<int>(i));
        }
    }

    // Inserts x and restores the heap order.
    void Push(const T& x)
    {
        _arrays.push_back(x);
        AdjustUp(static_cast<int>(_arrays.size()) - 1);
    }

    // Removes the top element. The heap must not be empty.
    void Pop()
    {
        assert(_arrays.size() > 0);
        using std::swap;
        swap(_arrays[0], _arrays[_arrays.size() - 1]);
        _arrays.pop_back();
        AdjustDown(0);
    }

    // Returns the top element. The heap must not be empty.
    T& Top()
    {
        assert(_arrays.size() > 0);
        return _arrays[0];
    }

    const T& Top() const
    {
        assert(_arrays.size() > 0);
        return _arrays[0];
    }

    bool Empty() const
    {
        return _arrays.empty();
    }

    int Size() const
    {
        return static_cast<int>(_arrays.size());
    }

    // Sifts the element at index `root` down until neither child orders
    // before it. Does nothing for an index outside the heap.
    void AdjustDown(int root)
    {
        if (root < 0)
        {
            return;
        }

        const size_t count = _arrays.size();
        size_t parent = static_cast<size_t>(root);
        size_t child = parent * 2 + 1;
        while (child < count)
        {
            // Pick the smaller of the two children.
            if (child + 1 < count && *_arrays[child + 1] < _arrays[child])
            {
                ++child;
            }

            if (*_arrays[child] < _arrays[parent])
            {
                using std::swap;
                swap(_arrays[child], _arrays[parent]);
                parent = child;
                child = 2 * parent + 1;
            }
            else
            {
                break;
            }
        }
    }

    // Sifts the element at index `child` up while it orders before its parent.
    void AdjustUp(int child)
    {
        int parent = (child - 1) / 2;
        while (child > 0)
        {
            if (*_arrays[child] < _arrays[parent])
            {
                using std::swap;
                swap(_arrays[parent], _arrays[child]);
                child = parent;
                parent = (child - 1) / 2;
            }
            else
            {
                break;
            }
        }
    }

public:
    std::vector<T> _arrays;  // heap storage; index 0 is the top
};
最小堆里也完成了很多接口,包括push pop等 然后就是几个压缩和解压的函数接口
1.根据哈夫曼树获取哈夫曼编码:
// Walks the tree rooted at `root` and, for every leaf, stores that leaf's
// Huffman code in _infos[<leaf byte>]._code.
//
// The code of a leaf is read from the leaf up to the root ('1' when the node
// is its parent's left child, '0' when it is the right child) and the string
// is then reversed. Bits are appended to whatever the _code entry already
// holds.
void _GenerateHuffmanCode(HuffmanTreeNode<FileInfo>* root)
{
    if (!root)
    {
        return;
    }

    _GenerateHuffmanCode(root->_left);
    _GenerateHuffmanCode(root->_right);

    // Only leaves receive a code.
    const bool isLeaf = (root->_left == nullptr) && (root->_right == nullptr);
    if (!isLeaf)
    {
        return;
    }

    std::string& bits = _infos[root->_weight._ch]._code;
    for (HuffmanTreeNode<FileInfo>* node = root; node->_parent != nullptr; node = node->_parent)
    {
        HuffmanTreeNode<FileInfo>* up = node->_parent;
        if (up->_left == node)
        {
            bits += '1';
        }
        else if (up->_right == node)
        {
            bits += '0';
        }
    }

    // Collected leaf-to-root; flip it.
    std::reverse(bits.begin(), bits.end());
}
// Builds the Huffman tree from the first `size` elements of `a` and stores
// its root in _root.
//
// a       - array of weights; must not be null.
// size    - number of elements in `a`.
// invalid - elements that do not differ from this value (per operator!=) are
//           left out of the tree.
//
// When no element qualifies, _root is set to nullptr instead of reading the
// top of an empty heap.
void CreateTree(T *a, size_t size, const T& invalid)
{
    assert(a);

    // The heap stores node pointers, so its element type must be the pointer
    // type, not the node type.
    Heap<HuffmanTreeNode<T>*> nodes;
    for (size_t i = 0; i < size; ++i)
    {
        if (a[i] != invalid)
        {
            nodes.Push(new HuffmanTreeNode<T>(a[i]));
        }
    }

    if (nodes.Empty())
    {
        _root = nullptr;
        return;
    }

    // Repeatedly merge the two smallest nodes under a new parent.
    while (nodes.Size() > 1)
    {
        HuffmanTreeNode<T>* left = nodes.Top();
        nodes.Pop();
        HuffmanTreeNode<T>* right = nodes.Top();
        nodes.Pop();

        HuffmanTreeNode<T>* parent = new HuffmanTreeNode<T>(left->_weight + right->_weight);
        parent->_left = left;
        parent->_right = right;
        left->_parent = parent;
        right->_parent = parent;
        nodes.Push(parent);
    }

    _root = nodes.Top();
    nodes.Pop();
}
// Reads one record from the configuration file and appends it to `line`
// without the terminating '\n'.
//
// If the first byte read is '\n', it is treated as the record's own leading
// character and kept in `line`, and reading continues up to the next '\n'.
//
// Returns false when the stream is already at end of file, true otherwise.
// A final record that is not terminated by '\n' is returned as-is; reading
// stops at end of file instead of looping forever.
bool _ReadLine(FILE *fOutLogFile, std::string& line)
{
    // fgetc returns int so that EOF can be told apart from every byte value.
    int ch = fgetc(fOutLogFile);
    if (ch == EOF)
    {
        return false;
    }

    if (ch == '\n')
    {
        line += static_cast<char>(ch);
        ch = fgetc(fOutLogFile);
    }

    while (ch != EOF && ch != '\n')
    {
        line += static_cast<char>(ch);
        ch = fgetc(fOutLogFile);
    }
    return true;
}
4.文件压缩
//文件压缩bool Compress(const char* filename){//1.打开一个文件,统计文件字符出现的次数//2.生成对应的哈弗曼编码//3.压缩文件//4.写配置文件,方便解压缩assert(filename);FILE *fOut = fopen(filename, "rb");assert(fOut);//统计文件字符出现的次数unsigned char ch = fgetc(fOut);while (!feof(fOut)) //文件结束{_infos[ch]._count++;ch = fgetc(fOut);}HuffmanTree<FileInfo> ht;FileInfo invalid;ht.CreateTree(_infos, 256, invalid);//哈夫曼编码_GenerateHuffmanCode(ht.GetRoot());string compressFile = filename;compressFile += ".huf";//压缩后的文件名 后缀为《输入文件名+.huf》FILE *finCompress = fopen(compressFile.c_str(), "wb"); //获取string中的C字符串assert(finCompress);fseek(fOut, 0, SEEK_SET);//将文件指针移到开头char cha = fgetc(fOut);unsigned char inch = 0;int index = 0; //一个字节的八位while (!feof(fOut)){string& code = _infos[(unsigned char)cha]._code;for (size_t i = 0; i < code.size(); ++i){inch <<= 1; //低位向高位进if (code[i] == '1'){inch |= 1;}if (++index == 8){fputc(inch, finCompress); //够8位,装进文件index = 0; //重新一轮开始inch = 0;}}cha = fgetc(fOut);}fclose(fOut);//如果index = 0 说明 上边8位刚好存满 不等 下一个自己又出来了if (index != 0) //处理最后一个字符不够的问题{inch <<= (8 - index); //最高位必须装上 后边的浪费掉fputc(inch, finCompress);}fclose(finCompress);}
5.写配置文件:
string logFile = filename;logFile += ".log";FILE *Log = fopen(logFile.c_str(), "wb");assert(Log);string chInfo;char str[128] = {0}; //没空间 不可以for (size_t i = 1; i < 256; ++i){if (_infos[i]._count > 0){chInfo += _infos[i]._ch;chInfo += ',';chInfo += _itoa(_infos[i]._count,str,10);chInfo += '\n';fputs(chInfo.c_str(), Log);chInfo.clear();}}fclose(Log);
6.最后的文件解压:
//重构文件void _RestoreFiles(HuffmanTreeNode<FileInfo> *root, const char* Fileneme,long long size){assert(root);//原压缩文件string name = Fileneme;name += ".huf";FILE* Out = fopen(name.c_str(),"rb");assert(Out);string restorefilename = Fileneme;restorefilename += ".over";FILE *over = fopen(restorefilename.c_str(),"wb");assert(over);int pos = 8;long long poss = size;unsigned char chz = fgetc(Out);while (poss>0){HuffmanTreeNode<FileInfo>* cur = nullptr;cur = root;while (cur->_left != nullptr || cur->_right != nullptr){pos--;unsigned char temp = chz >> pos;int ch = 1 & temp;if (ch == 0){cur = cur->_right;}else if (ch == 1){cur = cur->_left;}if (pos == 0){chz = fgetc(Out);pos = 8;}}fputc(cur->_weight._ch, over);poss--;}fclose(Out);fclose(over);}void UnCompress(const char* Fileneme)//解压缩{//1.打开日志文件//2.根据信息还原哈夫曼树//3.还原信息;string UnCompressneme = Fileneme;UnCompressneme += ".log";FILE *fOutLogFile = fopen(UnCompressneme.c_str(), "rb");assert(fOutLogFile);string line;while (_ReadLine(fOutLogFile, line)){unsigned char ch = line[0];_infos[ch]._count = atoi(line.substr(2).c_str());line.clear();} HuffmanTree<FileInfo> f;FileInfo invalid;f.CreateTree(_infos, 256, invalid);//根据重建的哈夫曼树 还原文件;long long size = f.GetRoot()->_weight._count;_RestoreFiles(f.GetRoot(), Fileneme,size);}
到此,此项目基本完成;如遇问题,希望留言,随时解答,如有见解,跪求赐教!