/* huffman.cpp * * (c) Copyright 1999 William A. McKee. All rights reserved. * * This program will analysis an input file (8 bits per value) and * generate the optimal Huffman encoding based on frequency of occurance. * * There is a compile time option (/DCOMPLETE_TREE) that will include * all 8 bit values even if they do not appear in the input file. Otherwise * the Huffman encoding will only include those codes with appear in the file. * * There is a special value for the end Of file (EOF) that is always included. * And it is contrived to have the Huffman encoded binary string of all ones. * * To compress a file, simply use the output table as a guide to substitute * the 8 bit values (plus the EOF value) with the Huffman encoded binary * strings. * * To decompress a file, reverse the process. You will notice that each binary * string is never the prefix of another binary string. This way you can * always tell how long the input binary string is by its value. When you * encounter a EOF value, you're done. * * A simple extension is the replacement of substring (dictionary * lookup) with special values that are then Huffman encoded. For example, * "\x0D\x0A" (CR+LF) sequences could have a special value since they almost * always appear together in MSDOS files. * * More values could be used in situations such as compressing a Double Byte * Character Set encoded file by mapping the DBCS codes to values greater than * 256. Or encoding UNICODE encoded text by using 16 bit values instead of 8 * bit values. These techniques would improve compression. * * In conclusion, the choice of values is highly dependent of the type of data * you plan to compress. The best way is to have a large sample of the data * and use frequency of occurance and common sense as a guide to your choices. * */ #include #include #include #include const int CHILD_NODE = -1; const int EOF_CODE = 256; const int NUMBER_OF_CODES = 257; class huffmanClass { private : int value; bool elim; int count; int level; huffmanClass * left, * right; public : huffmanClass (huffmanClass * l, huffmanClass * r); huffmanClass (int v) { value = v; #ifdef COMPLETE_TREE elim = false; #else // PARTIAL_TREE elim = true; #endif count = 0; level = 0; left = NULL; right = NULL; } ~huffmanClass () { if (left != NULL) delete left; if (right != NULL) delete right; } void eliminate () { elim = true; } void operator ++ (); friend int FindMinimum (huffmanClass * * huffman, int size); friend void PrintRecursive (huffmanClass * huffman, int level); void print () { printf (" Count Char Huffman Encoding\n"); printf (" ----- ---- ----------------\n"); PrintRecursive (this, 0); } bool AdjustmentForEOF () { if (value == EOF_CODE) return true; else if (value != CHILD_NODE) return false; else if (left->AdjustmentForEOF ()) { huffmanClass * t = left; left = right; right = t; return true; } else if (right->AdjustmentForEOF ()) return true; else return false; } }; huffmanClass::huffmanClass (huffmanClass * l, huffmanClass * r) { value = CHILD_NODE; elim = false; count = l->count + r->count; level = __max (l->level, r->level) + 1; left = l; right = r; } void huffmanClass::operator ++ () { elim = false; count ++; } void PrintRecursive (huffmanClass * huffman, int level) { static char buffer [1000]; if (level >= 1000) { printf ("\n"); return; } if (huffman->value != CHILD_NODE) { buffer [level] = '\0'; if (huffman->value == EOF_CODE) printf ("%10d: %s\n", huffman->count, buffer); else printf ("%10d: 0x%02X %s\n", huffman->count, huffman->value, buffer); } else { buffer [level] = '0'; PrintRecursive (huffman->left, level+1); buffer [level] = '1'; PrintRecursive (huffman->right, level+1); } } static const int NotFound = -1; int FindMinimum (huffmanClass * * huffman, int size) { int j = NotFound; int m = INT_MAX; int l = INT_MAX; for (int i = 0; i < size; i++) if (huffman [i] != NULL && ! huffman [i] -> elim && (huffman [i] -> count < m || huffman [i] -> count == m && huffman [i] -> level < l)) { j = i; m = huffman [i] -> count; l = huffman [i] -> level; } return j; } huffmanClass * BuildHuffman (huffmanClass * * huffman, int size) { int m1, m2; for (;;) { m1 = FindMinimum (huffman, size); if (m1 == NotFound) break; huffman [m1] -> eliminate (); m2 = FindMinimum (huffman, size); if (m2 == NotFound) break; huffman [m2] -> eliminate (); huffman [m1] = new huffmanClass (huffman [m1], huffman [m2]); huffman [m2] = NULL; } return m1 == NotFound ? NULL : huffman [m1]; } int main (int argc, char * argv []) { huffmanClass * huffman [NUMBER_OF_CODES]; int i; if (argc < 2) { fprintf (stderr, "usage: huffman ...\n"); exit (EXIT_FAILURE); } for (i = 0; i < NUMBER_OF_CODES; i++) huffman [i] = new huffmanClass (i); for (i = 1; i < argc; i++) { FILE * fp = fopen (argv [i], "rb"); if (fp == NULL) { fprintf (stderr, "error : unable to open input file '%s'.\n", argv [i]); exit (EXIT_FAILURE); } int ch; while ((ch = fgetc (fp)) != EOF) ++ (* huffman [ch]); fclose (fp); } ++ (* huffman [EOF_CODE]); // reserve one code for EOF huffmanClass * h = BuildHuffman (huffman, NUMBER_OF_CODES); assert (h != NULL); // by virtue of at least one (EOF) code being present. h -> AdjustmentForEOF (); h -> print (); delete h; return EXIT_SUCCESS; }