目录
- 原理
- 类型定义
- 完整代码
- 实验
- 无重复数据的压缩情况
- 有重复数据的压缩情况
- 数据中只有一种字符的情况
原理
Huffman编码先统计数据中每个字符的出现次数,再根据出现次数为每个字符分配编码:出现次数越多的字符使用越短的编码,从而达到压缩数据的目的。
类型定义
定义Huffman树节点类型和Huffman结构体。由于一个字节最多只有256种取值,index_table和count_table的长度设置为256即可覆盖所有输入数据。
/* One node of the Huffman tree (leaf or internal node). */
typedef struct _huff_tree {
    uint8_t data;               /* byte value carried by a leaf */
    uint8_t pos;                /* branch taken from the parent: left is 1, right is 0 */
    uint32_t count;             /* number of occurrences of `data` (leaves) */
    struct _huff_tree *parant;  /* parent node; stays NULL for the root */
    struct _huff_tree *left;
    struct _huff_tree *right;
} huff_tree;

/* Working state shared by the encoder and the decoder. */
typedef struct {
    huff_tree *tree;                /* root of the Huffman tree */
    uint32_t index_table_index;     /* number of used entries in index_table */
    huff_tree *index_table[256];    /* one leaf per distinct byte value */
    uint32_t count_table[256];      /* occurrence count per byte value */
    uint8_t *out;                   /* encoder: start of the payload area */
    uint32_t out_len;               /* encoder: payload bytes used so far */
    const uint8_t *in;              /* decoder: start of the payload */
    uint32_t in_len;
    uint32_t in_bit_count;          /* decoder: number of valid payload bits */
    uint32_t arr_bit_index;         /* current bit position in the payload */
    /* The members below are used for debugging only. */
    uint32_t tree_point_num;        /* number of tree nodes currently allocated */
} huffman_def;
完整代码
头文件定义如下:
#ifndef huffman_h__
#define huffman_h__

#include <stdint.h>

/**
 * Huffman-encode `in_len` bytes starting at `in`.
 *
 * On success `*out` points to a heap buffer holding `*out_len` encoded bytes
 * and 0 is returned. The caller must free() `*out` after use.
 */
int hm_encode(const uint8_t *in, const int in_len, uint8_t **out, int *out_len);

/**
 * Decode a buffer produced by hm_encode().
 *
 * On success `*out` points to a heap buffer holding `*out_len` decoded bytes
 * and 0 is returned. The caller must free() `*out` after use.
 */
int hm_decode(const uint8_t *in, const int in_len, uint8_t **out, int *out_len);

#endif
源文件定义如下:
#include "stdlib.h"
#include "stdio.h"
#include "string.h"
#include "huffman_.h"
#include <stdlib.h>
// Huffman coding implementation

#define DBG_WARN printf
#define DBG_LOG printf

/* One node of the Huffman tree (leaf or internal node). */
typedef struct _huff_tree {
    uint8_t data;               /* byte value carried by a leaf */
    uint8_t pos;                /* branch taken from the parent: left is 1, right is 0 */
    uint32_t count;             /* number of occurrences of `data` (leaves) */
    struct _huff_tree *parant;  /* parent node; stays NULL for the root */
    struct _huff_tree *left;
    struct _huff_tree *right;
} huff_tree;

/* Working state shared by the encoder and the decoder. */
typedef struct {
    huff_tree *tree;                /* root of the Huffman tree */
    uint32_t index_table_index;     /* number of used entries in index_table */
    huff_tree *index_table[256];    /* one leaf per distinct byte value */
    uint32_t count_table[256];      /* occurrence count per byte value */
    uint8_t *out;                   /* encoder: start of the payload area */
    uint32_t out_len;               /* encoder: payload bytes used so far */
    const uint8_t *in;              /* decoder: start of the payload */
    uint32_t in_len;
    uint32_t in_bit_count;          /* decoder: number of valid payload bits */
    uint32_t arr_bit_index;         /* current bit position in the payload */
    /* The members below are used for debugging only. */
    uint32_t tree_point_num;        /* number of tree nodes currently allocated */
} huffman_def;

static int hm_calc_value_of_tree(huff_tree *t);
static int hm_calc_deep_of_child(huff_tree *t);

// Create a tree node
/**
 * Allocate one zero-initialised tree node and account for it in
 * h->tree_point_num.
 *
 * Returns the new node, or NULL when the allocation fails; the counter is
 * only incremented for nodes that were actually allocated.
 */
static huff_tree *hm_creat_tree_point(huffman_def *h)
{
    huff_tree *node = calloc(1, sizeof *node);

    if (node != NULL) {
        h->tree_point_num++;
    }
    return node;
}

// Delete a tree node
/**
 * Release a single tree node and decrement the debug node counter.
 * When the counter is already zero nothing is freed.
 */
static void hm_del_tree_point(huffman_def *h, huff_tree *t)
{
    if (h->tree_point_num == 0) {
        return;
    }
    h->tree_point_num -= 1;
    free(t);
}

// Sort by occurrence count
/**
 * Sort `num` (sub)trees in place by descending weight, where the weight of a
 * tree is what hm_calc_value_of_tree() returns for it.
 */
static void hm_sort_index_table(huff_tree **table, int num)
{
    for (int head = 0; head < num; head++) {
        for (int probe = head + 1; probe < num; probe++) {
            /* Pull any heavier tree found later in the table up to `head`. */
            if (hm_calc_value_of_tree(table[probe]) > hm_calc_value_of_tree(table[head])) {
                huff_tree *held = table[head];
                table[head] = table[probe];
                table[probe] = held;
            }
        }
    }
}

// Print index_table
/** Debug helper: log every index_table entry with its byte value and count. */
static void hm_index_table_print(huffman_def *h)
{
    int slot = 0;

    DBG_LOG("-----index_table-----\n");
    while (slot < h->index_table_index) {
        huff_tree *leaf = h->index_table[slot];
        DBG_LOG("index:%d,data:%02x,count:%d\n", slot, leaf->data, leaf->count);
        slot++;
    }
}

// Print the code of each symbol
/**
 * Debug helper: log the code of every symbol in index_table.
 * The bits are printed while walking from the leaf up to the root.
 */
static void hm_data_code_print(huffman_def *h)
{
    DBG_LOG("------data code------\n");
    for (uint32_t i = 0; i < h->index_table_index; i++) {
        huff_tree *t = h->index_table[i];

        DBG_LOG("%c:", t->data);
        while (t->parant) {
            DBG_LOG("%d", t->pos);
            t = t->parant;
        }
        DBG_LOG("\n");
    }
}

/**
 * Count how often each byte value occurs in d[0 .. d_len-1] and create one
 * leaf per distinct value in h->index_table, sorted by descending count.
 *
 * NOTE(review): a failed node allocation is not handled here; the function
 * has no way to report it to the caller.
 */
static void hm_calc_count(huffman_def *h, const uint8_t *d, const int d_len)
{
    /* Clear the whole table: it holds 256 uint32_t counters, not 256 bytes. */
    memset(h->count_table, 0, sizeof h->count_table);

    for (int i = 0; i < d_len; i++) {
        h->count_table[d[i]]++;
    }

    for (int value = 0; value < 256; value++) {
        uint32_t slot;

        if (h->count_table[value] == 0) {
            continue;
        }
        slot = h->index_table_index;
        h->index_table[slot] = hm_creat_tree_point(h);
        h->index_table[slot]->count = h->count_table[value];
        h->index_table[slot]->data = value;
        h->index_table_index++;
    }

    hm_sort_index_table(h->index_table, h->index_table_index);
}

// Compute the length of the encoded output
// index_table must be filled and the Huffman tree built before calling this
/**
 * Return the number of bytes the encoded stream occupies: symbol-count byte,
 * symbol table, padding-count byte and the bit-packed payload.
 */
static int hm_calc_encode_len(huffman_def *h)
{
    /* 1 byte for the table length plus one value byte per table entry */
    int total = 1 + h->index_table_index;
    int payload_bits = 0;

    for (int k = 0; k < h->index_table_index; k++) {
        huff_tree *leaf = h->index_table[k];

        /* bytes used to store this entry's count */
        total += leaf->count / 255 + 1;
        /* bits this symbol contributes to the payload */
        payload_bits += hm_calc_deep_of_child(leaf) * leaf->count;
    }

    /* one byte for the padding count, then the payload rounded up to bytes */
    total += 1 + (payload_bits + 7) / 8;
    DBG_LOG("data len for encode:%d\n", total);
    return total;
}

// Compute the weight of a tree
/**
 * Return the weight of a (sub)tree: the count of a leaf, or the sum of the
 * weights of both children for a node that has a left and a right child.
 */
static int hm_calc_value_of_tree(huff_tree *t)
{
    if (!(t->left && t->right)) {
        return t->count;
    }
    return hm_calc_value_of_tree(t->left) + hm_calc_value_of_tree(t->right);
}

// Compute the depth of a node
/** Return how many parent links separate `t` from the root of its tree. */
static int hm_calc_deep_of_child(huff_tree *t)
{
    int levels = 0;

    for (huff_tree *up = t->parant; up; up = up->parant) {
        levels++;
    }
    return levels;
}

// Print the Huffman tree
/**
 * Debug helper: log the tree rooted at `t`, visiting each node before its
 * left and right subtrees.
 */
static void hm_tree_print(huff_tree *t)
{
    if (!(t->left && t->right)) {
        DBG_LOG("data:%d,count:%d\n", t->data, t->count);
        return;
    }

    DBG_LOG("point:,count:%d\n", hm_calc_value_of_tree(t));
    hm_tree_print(t->left);
    hm_tree_print(t->right);
}

// Build the Huffman tree
static void hm_creat_tree(huffman_def *h)
{int tail=h->index_table_index;huff_tree *sub1,*sub2;huff_tree **table=calloc(tail,sizeof(huff_tree *));for(int i=0;i<tail;i++){table[i]=h->index_table[i];}while(tail>1){huff_tree *temp;sub1=table[tail-1];sub2=table[tail-2];// 大在左,小在右temp=hm_creat_tree_point(h);sub1->parant=temp;sub2->parant=temp;// 左为1,右为0if(hm_calc_value_of_tree(sub1)>hm_calc_value_of_tree(sub2)){temp->left=sub1;sub1->pos=1;temp->right=sub2;sub2->pos=0;}else{temp->left=sub2;sub2->pos=1;temp->right=sub1;sub1->pos=0;}table[tail-2]=temp;tail--;hm_sort_index_table(table,tail);// DBG_LOG("-----table-----\n");// for(int i=0;i<tail;i++){// DBG_LOG("index:%d,count:%d\n",i,hm_calc_value_of_tree(table[i]));// }}h->tree=table[0];free(table);
}// 删除树
/**
 * Recursively release the tree rooted at `t` (children first, then the node
 * itself). A NULL `t` is ignored.
 */
static void hm_del_tree(huffman_def *h, huff_tree *t)
{
    if (t == NULL) {
        return;
    }
    if (t->left && t->right) {
        hm_del_tree(h, t->left);
        hm_del_tree(h, t->right);
    }
    hm_del_tree_point(h, t);
}

// Append one bit to the data
/**
 * Append one bit to the bit buffer `d`.
 *
 * `*d_len` is the number of bytes in use and `*index` the number of bits
 * written so far. Bits fill each byte starting at its least significant bit;
 * a new byte is started when all bits of the used bytes are taken.
 * Both counters are updated.
 */
static void hm_add_bit(uint8_t *d, int *d_len, int bit, int *index)
{
    const int used_bytes = *d_len;
    const int cursor = *index;

    if (cursor >= used_bytes * 8) {
        /* Start a new byte; the bit lands in position 0. */
        d[used_bytes] = bit;
        *d_len = used_bytes + 1;
    } else {
        d[used_bytes - 1] |= bit << (cursor % 8);
    }
    *index = cursor + 1;
}

// Append the code bits for one input byte
/**
 * Append the Huffman code of byte `d` to the payload at h->out.
 *
 * The code bits are emitted while walking from the symbol's leaf up to the
 * root. h->out_len and h->arr_bit_index are advanced accordingly.
 * Returns 0; terminates the process when `d` is not in the symbol table.
 */
static int hm_encode_byte(huffman_def *h, uint8_t d)
{
    huff_tree *t = NULL;
    /* hm_add_bit() works on int counters, the state keeps them as uint32_t. */
    int out_len = (int)h->out_len;
    int bit_index = (int)h->arr_bit_index;

    for (uint32_t i = 0; i < h->index_table_index; i++) {
        if (h->index_table[i]->data == d) {
            t = h->index_table[i];
            break;
        }
    }
    if (t == NULL) {
        DBG_WARN("can not encode.\n");
        exit(-1);
    }

    while (t->parant) {
        hm_add_bit(h->out, &out_len, t->pos, &bit_index);
        t = t->parant;
    }

    h->out_len = (uint32_t)out_len;
    h->arr_bit_index = (uint32_t)bit_index;
    return 0;
}

// Write the index (symbol) table
/**
 * Serialise the stream header into `data`, starting at offset `*data_len`:
 * the number of table entries, then for every entry its byte value followed
 * by its count, and finally one byte holding the padding-bit count.
 * `*data_len` is advanced past the bytes written. Always returns 0.
 */
static int hm_creat_index_table(huffman_def *h, uint8_t *data, int *data_len)
{
    int at = *data_len;

    data[at++] = h->index_table_index;

    for (int i = 0; i < h->index_table_index; i++) {
        huff_tree *leaf = h->index_table[i];
        int occurrences = leaf->count;

        data[at++] = leaf->data;
        /* The count is stored as a run of 255 bytes closed by the remainder. */
        for (int full = occurrences / 255; full > 0; full--) {
            data[at++] = 255;
        }
        data[at++] = occurrences % 255;
    }

    /* Number of zero bits used as padding. */
    data[at++] = 8 - (h->arr_bit_index % 8);

    *data_len = at;
    return 0;
}

// Huffman encoding
/*
压缩后数据格式
data[0]:索引表长度
data[1 ~ n]:索引表,每个索引由值(1byte)和频次(1byte,小于255)(2byte,大于等于255,频次由两个字节相加)
data[n+1]:数据中填充0个数
data[n+2 ~ m]:压缩后的数据*/
int hm_encode(const uint8_t* in, const int in_len, uint8_t** out, int* out_len)
{int input_len = in_len;int output_len=0;int output_index = 0;huffman_def *h=calloc(1,sizeof(huffman_def));hm_calc_count(h,in, input_len);hm_creat_tree(h);DBG_LOG("huffman tree point num:%d\n",h->tree_point_num);output_len = hm_calc_encode_len(h);(*out) = calloc(output_len + 1, sizeof(uint8_t));hm_creat_index_table(h, *out, &output_index);DBG_LOG("output_len=%d\n", output_index);h->out = &(*out)[output_index];for(int i=0;i<input_len;i++){hm_encode_byte(h,in[i]);}DBG_LOG("bitcount:%d\n", h->arr_bit_index);(*out)[output_index-1] = h->out_len*8- h->arr_bit_index;DBG_LOG("fill with 0 by:%d\n", (*out)[output_index - 1]);(*out_len) = output_len;hm_del_tree(h,h->tree);DBG_LOG("after del tree point num:%d\n",h->tree_point_num);DBG_LOG("lenth_in:%d,length_encode:%d\n",input_len, output_len);free(h);return 0;
}// 读取编码表,返回数据开始的位置
/**
 * Parse the header of an encoded stream d[0 .. d_len-1].
 *
 * Creates one leaf per table entry in h->index_table, and sets
 * h->in_bit_count (valid payload bits) and h->in (start of the payload).
 * A table length byte of 0 is read as 256 entries.
 *
 * Returns the offset of the first payload byte, or -1 when the header is
 * truncated, the padding count exceeds the payload, or a node cannot be
 * allocated. On failure the leaves created so far are released and
 * h->index_table_index is reset to 0; callers must check the result.
 */
static int hm_unpack_count(huffman_def *h, const uint8_t *d, int d_len)
{
    int num;
    int index = 1;
    uint8_t temp;

    if (d == NULL || d_len < 1) {
        return -1;
    }
    num = d[0] == 0 ? 256 : d[0];

    for (int i = 0; i < num; i++) {
        if (index >= d_len) {
            goto fail;
        }
        h->index_table[i] = hm_creat_tree_point(h);
        if (h->index_table[i] == NULL) {
            goto fail;
        }
        /* Count the entry right away so the failure path releases it too. */
        h->index_table_index++;

        h->index_table[i]->data = d[index];
        index++;
        /* The count is a run of 0xFF bytes closed by a byte below 0xFF. */
        do {
            if (index >= d_len) {
                goto fail;
            }
            temp = d[index];
            index++;
            h->index_table[i]->count += temp;
        } while (temp == 0xff);
    }

    if (index >= d_len) {
        goto fail;
    }
    temp = d[index];
    index++;
    /* The padding cannot be larger than the payload itself. */
    if ((uint32_t)temp > (uint32_t)(d_len - index) * 8) {
        goto fail;
    }
    h->in_bit_count = (d_len - index) * 8 - temp;
    h->in = &d[index];
    printf("bitcount:%d,\n", h->in_bit_count);
    return index;

fail:
    while (h->index_table_index > 0) {
        h->index_table_index--;
        hm_del_tree_point(h, h->index_table[h->index_table_index]);
        h->index_table[h->index_table_index] = NULL;
    }
    return -1;
}

// Get the bit at the given index
/**
 * Return bit number `index` (0 or 1) of the byte array `d`; bit 0 is the
 * least significant bit of d[0].
 */
static inline int hm_get_bit(const uint8_t *d, int index)
{
    const int byte_at = index / 8;
    const int shift = index % 8;

    return (d[byte_at] >> shift) & 1;
}

// Compare against a tree node: returns the number of matched bits, or 0 on mismatch
/**
 * Compare the input bits starting at h->arr_bit_index with the `pos` values
 * found while walking from `t` up through its parents.
 *
 * Returns the number of bits matched and advances h->arr_bit_index by that
 * amount, or returns 0 without advancing on the first mismatch.
 */
static inline int hm_cmp_bits(huffman_def *h, huff_tree *t)
{
    int count = 0;

    while (t) {
        if (hm_get_bit(h->in, h->arr_bit_index + count) != t->pos) {
            return 0;
        }
        count++;
        t = t->parant;
    }
    h->arr_bit_index += count;
    return count;
}

/**
 * Decode one byte by reading the payload backwards from h->arr_bit_index
 * and descending from the root until a leaf is reached.
 *
 * The walk also stops when no input bits are left, so a malformed stream
 * cannot move the bit cursor below zero; the byte returned in that case is
 * the `data` of whatever node was reached.
 */
static uint8_t hm_decode_byte(huffman_def *h)
{
    huff_tree *t = h->tree;

    while (t->left && t->right && h->arr_bit_index > 0) {
        int bit = hm_get_bit(h->in, h->arr_bit_index - 1);

        if (bit == t->left->pos) {
            t = t->left;
        } else {
            t = t->right;
        }
        h->arr_bit_index--;
    }
    return t->data;
}

/** Return the decoded length: the sum of the counts of all table entries. */
static int hm_calc_decode_len(huffman_def *h)
{
    int sum = 0;

    for (uint32_t i = 0; i < h->index_table_index; i++) {
        sum += h->index_table[i]->count;
    }
    DBG_LOG("data len for decode:%d\n", sum);
    return sum;
}

// Huffman decoding
/*
*/
int hm_decode(const uint8_t* in, const int in_len, uint8_t** out, int* out_len)
{int decode_len,decode_index;uint8_t *decode_data=0;uint8_t c;huffman_def *h=calloc(1,sizeof(huffman_def));if (h == 0) {return -1;}hm_unpack_count(h,in,in_len);hm_creat_tree(h);// hm_data_code_print(h);// hm_tree_print(h->tree);DBG_LOG("huffman tree point num:%d\n",h->tree_point_num);decode_len=hm_calc_decode_len(h);decode_index=decode_len;decode_data=calloc(decode_len+1,sizeof(uint8_t));h->arr_bit_index=h->in_bit_count;while(decode_index >0){c=hm_decode_byte(h);decode_data[decode_index-1]=c;decode_index--;}hm_del_tree(h,h->tree);DBG_LOG("after del tree point num:%d\n",h->tree_point_num);free(h);(*out) = decode_data;(*out_len) = decode_len;return 0;
}
实验
无重复数据的压缩情况
编写实验代码如下:
/* Demo: encode and decode 70 distinct byte values (no repetition at all). */
int main(int argc, char *argv[])
{
    const uint8_t file_data[] = {
         1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
        29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
        43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
        57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70
    };
    uint8_t *encode_data = 0;
    int encode_size;
    uint8_t *decode_data = 0;
    int decode_size;

    (void)argc;
    (void)argv;

    if (hm_encode(file_data, sizeof(file_data), &encode_data, &encode_size) != 0) {
        return 1;
    }
    print_data(encode_data, encode_size);

    if (hm_decode(encode_data, encode_size, &decode_data, &decode_size) != 0) {
        free(encode_data);
        return 1;
    }
    print_data(decode_data, decode_size);

    free(encode_data);
    free(decode_data);
    return 0;
}
可以看到,在输入数据完全没有重复的情况下,压缩之后的数据反而变大了(原始数据长度为70,压缩之后的数据长度为196),约为原来的2.8倍。
有重复数据的压缩情况
编写验证代码如下:
/* Demo: encode and decode a text with many repeated characters. */
int main(int argc, char *argv[])
{
    const uint8_t file_data[] = "2023 5830628A000005830628A000015830628A000025830628A000035830628A000045830628A000055830628A000065830628A000075830628A000085830628A00009";
    uint8_t *encode_data = 0;
    int encode_size;
    uint8_t *decode_data = 0;
    int decode_size;

    (void)argc;
    (void)argv;

    /* sizeof includes the terminating NUL, so the decoded data is a C string. */
    if (hm_encode(file_data, sizeof(file_data), &encode_data, &encode_size) != 0) {
        return 1;
    }
    print_data(encode_data, encode_size);

    if (hm_decode(encode_data, encode_size, &decode_data, &decode_size) != 0) {
        free(encode_data);
        return 1;
    }
    printf("%s", (const char *)decode_data);

    free(encode_data);
    free(decode_data);
    return 0;
}
原始数据存在重复数据的时候,Huffman编码则可以大放异彩(原始数据长度136,压缩之后的数据长度76),数据量减小了接近一半。
数据中只有一种字符的情况
编写如下代码:
/* Demo: encode and decode 200 bytes that all hold the same value. */
int main(int argc, char *argv[])
{
    uint8_t file_data[200] = {0};
    uint8_t *encode_data = 0;
    int encode_size;
    uint8_t *decode_data = 0;
    int decode_size;

    (void)argc;
    (void)argv;

    if (hm_encode(file_data, sizeof(file_data), &encode_data, &encode_size) != 0) {
        return 1;
    }
    print_data(encode_data, encode_size);

    if (hm_decode(encode_data, encode_size, &decode_data, &decode_size) != 0) {
        free(encode_data);
        return 1;
    }
    print_data(decode_data, decode_size);

    free(encode_data);
    free(decode_data);
    return 0;
}
原始数据有200字节,压缩之后仅4个字节,[0x01,0x00,0xc8,0x00]。
| 字节 | 含义 |
|---|---|
| 0x01 | 压缩数据中出现的字符种类数 |
| 0x00 | 压缩数据中出现的第一个字符 |
| 0xc8 | 第一个字符的出现次数(200) |
| 0x00 | 编码末尾补0的bit数,此处为0,即没有补0 |