霍夫曼编码c++

2017-01-11

/*
 * huffman.c
 *  霍夫曼编码代码
 *  Created on: Dec 12, 2016
 *      Author: xuenhappy
 *
 *      sample:
 *
============
input file:
============

t 2
h 1
i 2
s 2
_ 7
a 4
n 2
e 4
x 1
m 2
p 1
l 1
o 1
f 3
H 1
u 1
r 1

==================================
output:
==================================

freq file:char_freq_map.dat
--------------------------------------
source H(x)=3.76975 bit
--------------------------------------
_	|7.00	|0.19444	|000
f	|3.00	|0.08333	|0010
m	|2.00	|0.05556	|0011
s	|2.00	|0.05556	|0100
t	|2.00	|0.05556	|0101
i	|2.00	|0.05556	|0110
n	|2.00	|0.05556	|0111
e	|4.00	|0.11111	|100
a	|4.00	|0.11111	|101
h	|1.00	|0.02778	|11000
r	|1.00	|0.02778	|11001
x	|1.00	|0.02778	|11010
u	|1.00	|0.02778	|11011
l	|1.00	|0.02778	|11100
H	|1.00	|0.02778	|11101
p	|1.00	|0.02778	|11110
o	|1.00	|0.02778	|11111
--------------------------------------
huffman avg code length=3.80556 bit

 *
 *
 */



#include <stdio.h>
#include <stdlib.h>
#include <map>
#include <regex.h>
#include <vector>
#include <string.h>
#include <string>
#include <algorithm>
#include <math.h>


typedef struct node{
	char* w;
	double freq;
	double max;
	struct node* f;
	struct node* n;
}H_NODE;

bool compare(node *a,node *b){
      return a->freq>b->freq;
}

void print_node(node* n,int code,int deep,double sum){
	if(n->w){
		printf("%s\t|%.2f\t|%.5f\t|",n->w,n->freq,n->freq/sum);
		for(int i=deep-1;i>=0;i--)
			printf("%d", (code&(1<<i)) != 0);
		printf("\n");
	}
	if(n->f)
		print_node(n->f,(code<<1)|0,deep+1,sum);
	if(n->n)
		print_node(n->n,(code<<1)|1,deep+1,sum);
}


void length_node(node* n,int deep,double f_sum,double &h_sum){
	if(n->f)
		length_node(n->f,deep+1,f_sum,h_sum);
	if(n->n)
		length_node(n->n,deep+1,f_sum,h_sum);
	if(n->w)
		h_sum+=deep*n->freq/f_sum;
}

void free_node(node* n){
	if(n->n)
		free_node(n->n);
	if(n->f)
		free_node(n->f);
	if(n->w){
		free(n->w);
	}
	free(n);
}


/**
 * 将指定频率的字母表转化为转化为对应的编码
 */
void huffman_encode(const std::map<std::string,double> &char_freqs){
	if(char_freqs.empty())
		return;

	//init
	H_NODE* nodes[char_freqs.size()];
	std::map<std::string,double>::const_iterator it;
	int i=0;
	double sum=0;
	for(it=char_freqs.begin();it!=char_freqs.end();it++){
		sum+=it->second;
	}
	double hx=0;
	for(it=char_freqs.begin();it!=char_freqs.end();it++){
		hx-=log(it->second/sum)*(it->second/sum);
	}
	hx=hx/log(2);
	printf("source H(x)=%.5f bit\n",hx);
	printf("--------------------------------------\n");
	for(it=char_freqs.begin();it!=char_freqs.end();it++){
		nodes[i]=(H_NODE*)malloc(sizeof(H_NODE));
		nodes[i]->freq=it->second;
		nodes[i]->max=it->second;
		nodes[i]->w=strdup(it->first.c_str());
		nodes[i]->f=nodes[i]->n=NULL;
		i++;
	}
	//build
	int size=char_freqs.size();
	while(size>1){
		std::sort(nodes,nodes+size, compare);
		H_NODE* c=(H_NODE*)malloc(sizeof(H_NODE));
		c->freq=nodes[size-1]->freq+nodes[size-2]->freq;
		c->w=NULL;
		if(nodes[size-2]->max>nodes[size-1]->max){
			c->max=nodes[size-2]->max;
			c->f=nodes[size-2];
			c->n=nodes[size-1];
		}else{
			c->max=nodes[size-1]->max;
			c->f=nodes[size-1];
			c->n=nodes[size-2];
		}
		nodes[size-1]=NULL;
		nodes[size-2]=c;
		size--;
	}
	//printf
	H_NODE* root=nodes[0];
	print_node(root,0,0,sum);
	printf("--------------------------------------\n");
	double sum_l=0;
	length_node(root,0,sum,sum_l);
	printf("huffman avg code length=%.5f bit\n",sum_l);
	//free
	free_node(root);
}

char* ctrim(char* c_str,size_t &c_len){
	//去掉头部的空格字符
	while(c_len>0&&isspace(*c_str) ){
		c_str++;
		c_len--;
	}
	char * start=c_str, *end = NULL;
	//去掉尾部
	while(c_len>0&&(*c_str != '\0')){
		if (!isspace(*c_str)){
			end = NULL;
			c_len--;
			c_str++;
			continue;
		}
		if(!end)end=c_str;
		c_len--;
		c_str++;
	}
	if(end){
		*end = '\0';
		c_len=end-start;
	}else{
		c_len=c_str-start;
	}
	return start;
}


void split(const char* input, regex_t &reg, std::vector<std::string> &out) {
	regmatch_t pm;
	while (!regexec(&reg, input, 1, &pm, 0)) {
		if(pm.rm_so>0)
			out.push_back(std::string(input, 0, pm.rm_so));
		input = input + (pm.rm_eo);
	}
	if ((*input) != '\0')
		out.push_back(input);
}

/**
 * 加载指定的频率文件
 */
int load_charc_seqs(const char* file,std::map<std::string,double> &char_freqs){
	FILE *fe=fopen(file,"r");
	if(!fe){
		printf("%s not exits or not read!\n",file);
		return EXIT_FAILURE;
	}
	printf("freq file:%s\n",file);
	printf("--------------------------------------\n");
	char buffer[1024];
	regex_t split_reg;
	regcomp(&split_reg, "\\s+", REG_EXTENDED);
	std::vector<std::string> strs;
	char* line;
	while(fgets(buffer,1024,fe)){
		strs.clear();
		size_t clen=strlen(buffer);
		line=ctrim(buffer, clen);
		if(clen<=0)
			continue;
		split(line, split_reg, strs);
		if(strs.size()!=2){
			printf("bad input line:%s\n",line);
			continue;
		}
		char_freqs[strs[0]]=atol(strs[1].c_str());
	}
	regfree(&split_reg);
	fclose(fe);
	return EXIT_SUCCESS;
}


int main(int argc, char **argv) {
	if(argc<2){
		printf("huffman_encode [charmap_freqs_file]\n",argv[0]);
		return EXIT_SUCCESS;
	}
	std::map<std::string,double> char_freqs;
	if(load_charc_seqs(argv[1], char_freqs)!=EXIT_SUCCESS){
		return EXIT_FAILURE;
	}
	huffman_encode(char_freqs);
	return EXIT_SUCCESS;
}








注明:本文章属于转载,仅供行业人员学习交流使用,文章版权属于原创作者,在此向原创者致敬,感谢原创作者为大家学习交流提供精品内容。

站方声明:IThao123是为广大互联网从业者免费提供学习交流的平台,如果侵犯了原创著作权,请联系站方删除,给你带来不便,深表歉意。

顶部