-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhuffman.c
134 lines (116 loc) · 3.52 KB
/
huffman.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <unistd.h>
#include <fcntl.h>
// We only use the "byte" data in leaf nodes, and each node has either
// two or no children. Thus, if "left" is null (no left child), then we know
// the node doesn't have a right child and can use "byte".
struct BST_Node {
struct BST_Node* left;
union {
struct BST_Node* right;
uint8_t byte;
};
};
struct Q_Node {
struct Q_Node* next;
struct BST_Node* bst;
uint64_t count;
};
void insert_Q_Node(struct Q_Node **iter, struct Q_Node *new_node)
{
while (*iter && (*iter)->count < new_node->count)
iter = &(*iter)->next;
new_node->next = *iter;
*iter = new_node;
}
double entropy(int64_t byte_count[256]);
void print_codes(struct BST_Node *node);
int main(int argc, uint8_t **argv)
{
if (argc < 2)
fprintf(stderr, "usage: ./huffman filename\n"), exit(1);
printf("opening and reading file\n");
int f = open(argv[1], O_RDONLY);
if (f == -1)
fprintf(stderr, "open failed\n"), exit(1);
uint64_t byte_count[256] = { 0 };
uint8_t input;
int bytes;
while ((bytes = read(f, &input, 1)) == 1)
byte_count[input]++;
close(f);
if (bytes < 0)
fprintf(stderr, "read failed\n"), exit(1);
printf("generating priority queue\n");
struct Q_Node *front = NULL;
for (int i = 0; i < 256; i++) {
if (byte_count[i]) {
struct Q_Node *new_node = calloc(sizeof(struct Q_Node), 1);
new_node->count = byte_count[i];
new_node->bst = calloc(sizeof(struct BST_Node), 1);
new_node->bst->byte = i;
insert_Q_Node(&front, new_node);
}
}
printf("transforming priority queue into huffman tree\n");
while (front->next) {
struct Q_Node *new_node = front;
struct BST_Node *new_bst = malloc(sizeof(struct BST_Node));
new_bst->left = new_node->next->bst;
new_bst->right = new_node->bst;
new_node->bst = new_bst;
new_node->count += new_node->next->count;
front = front->next->next;
free(new_node->next);
insert_Q_Node(&front, new_node);
}
struct BST_Node *huffman_tree = front->bst;
free(front);
printf("printing codes\n");
print_codes(huffman_tree);
printf("entropy: %f\n", (float)entropy(byte_count));
}
#include <ctype.h>
#define BIT_COUNT(x) (sizeof(x)*8)
void _print_codes(struct BST_Node *node, uint64_t code)
{
if (! node->left) {
// found leaf node
printf("0x%x %c\t", node->byte,
isprint(node->byte) ? node->byte : ' ');
uint64_t size = BIT_COUNT(code) - __builtin_ffsll(code);
code >>= __builtin_ffsll(code);
for (uint64_t i = 0; i < size; i++) {
printf("%d", code & 1);
code >>= 1;
}
printf("\n");
return;
}
code >>= 1;
_print_codes(node->left, code);
code |= 0x8000000000000000;
_print_codes(node->right, code);
}
inline void print_codes(struct BST_Node *node)
{
_print_codes(node, 0x8000000000000000);
}
#include <math.h>
// determines how well a huffman tree can compress some data
double entropy(int64_t byte_count[256])
{
int64_t total_bytes = 0;
// count bytes in file
for (int i = 0; i < 256; i++)
total_bytes += byte_count[i];
double e = 0;
for (int i = 0; i < 256; i++)
if (byte_count[i]) {
double p = (double)byte_count[i] / total_bytes;
e += p * (log(p) / log(2));
}
return -e;
}