diff options
Diffstat (limited to 'baseline/source/huff_dec')
-rw-r--r-- | baseline/source/huff_dec/ChangeLog.txt | 20 | ||||
-rw-r--r-- | baseline/source/huff_dec/compress.txt | 1107 | ||||
-rw-r--r-- | baseline/source/huff_dec/huff_dec.c | 393 |
3 files changed, 1520 insertions, 0 deletions
diff --git a/baseline/source/huff_dec/ChangeLog.txt b/baseline/source/huff_dec/ChangeLog.txt new file mode 100644 index 0000000..865c263 --- /dev/null +++ b/baseline/source/huff_dec/ChangeLog.txt | |||
@@ -0,0 +1,20 @@ | |||
1 | File: huff_dec.c | ||
2 | Original provenience: David Bourgin (David.Bourgin@ufrima.imag.fr) | ||
3 | |||
4 | 2016-03-23: | ||
5 | - Replaced dynamic memory allocation by a fixed array with 514 entries | ||
6 | - Replaced memset() with loops | ||
7 | - Replaced file I/O by reading and writing to char arrays | ||
8 | - Added huff_dec_main(), huff_dec_init() and huff_dec_return() | ||
9 | - Added huff_dec_ prefix to all functions, types and global variables | ||
10 | - Changed function arguments to ANSI style | ||
11 | - Replaced macro definitions | ||
12 | - Added generic TACLeBench header replacing previous header | ||
13 | - Included license from compress.txt | ||
14 | - Corrected loop bounds in huff_dec_read_header() | ||
15 | |||
16 | 2016-05-24: | ||
17 | - static removed and huff_dec prefix added | ||
18 | |||
19 | 2016-05-25: | ||
20 | - added const to void g++ warning | ||
diff --git a/baseline/source/huff_dec/compress.txt b/baseline/source/huff_dec/compress.txt new file mode 100644 index 0000000..5966bcf --- /dev/null +++ b/baseline/source/huff_dec/compress.txt | |||
@@ -0,0 +1,1107 @@ | |||
1 | +===========================================================+ | ||
2 | | Introduction to the losslessy compression schemes | | ||
3 | | Description of the codec source codes | | ||
4 | +-----------------------------------------------------------+ | ||
5 | | From David Bourgin (E-mail: david.bourgin@ufrima.imag.fr) | | ||
6 | | Date: 22/9/94 | | ||
7 | +===========================================================+ | ||
8 | |||
9 | ------ BE CARE ------ | ||
10 | This file (compress.txt) is copyrighted. (c) David Bourgin - 1994 | ||
11 | Permission to use this documentation for any purpose other than | ||
12 | its incorporation into a commercial product is hereby granted without fee. | ||
13 | Permission to copy and distribute this documentation only for non-commercial use | ||
14 | is also granted without fee, provided, however, that the above copyright notice | ||
15 | appears in all copies, that both that copyright notice and this permission notice appear in supporting documentation. The author makes no representations about | ||
16 | the suitability of this documentation for any purpose. It is provided "as is" | ||
17 | without express or implied warranty. | ||
18 | |||
19 | The source codes you obtain with this file are *NOT* covered by the same | ||
20 | copyright, because you can include them for both commercial and non-commercial | ||
21 | use. See below for more infos. | ||
22 | |||
23 | The source code files (codrl1.c, dcodrl1.c, codrle2.c, dcodrle2.c, codrle3.c, | ||
24 | dcodrle3.c, codrle4.c, dcodrle4.c, codhuff.c, dcodhuff.c) are copyrighted. | ||
25 | They have been uploaded on ftp in turing.imag.fr (129.88.31.7):/pub/compression | ||
26 | on 22/5/94 and have been modified on 22/9/94. | ||
27 | (c) David Bourgin - 1994 | ||
28 | The source codes I provide have no buggs (!) but being that I make them | ||
29 | available for free I have some notes to make. They can change at any time | ||
30 | without notice. I assume no responsability or liability for any errors or | ||
31 | inaccurracies, make no warranty of any kind (express, implied or statutory) | ||
32 | with respect to this publication and expressly disclaim any and all warranties | ||
33 | of merchantability, fitness for particular purposes. Of course, if you have | ||
34 | some problems to use the information presented here, I will try to help you if | ||
35 | I can. | ||
36 | |||
37 | If you include the source codes in your application, here are the conditions: | ||
38 | - You have to put my name in the header of your source file (not in the | ||
39 | excutable program if you don't want) (this item is a must) | ||
40 | - I would like to see your resulting application, if possible (this item is not | ||
41 | a must, because some applications must remain secret) | ||
42 | - Whenever you gain money with your application, I would like to receive a very | ||
43 | little part in order to be encouraged to update my source codes and to develop | ||
44 | new schemes (this item is not a must) | ||
45 | --------------------- | ||
46 | |||
47 | There are several means to compress data. Here, we are only going to deal with | ||
48 | the losslessy schemes. These schemes are also called non-destructive because | ||
49 | you always recover the initial data you had, and this, as soon as you need them. | ||
50 | With losslessy schemes, you won't never lose any informations (except perhaps | ||
51 | when you store or transmit your data but this is another problem...). | ||
52 | |||
53 | In this introduction, we are going to see: | ||
54 | - The RLE scheme (with different possible algorithms) | ||
55 | - The Huffman schemes (dynamical scheme) | ||
56 | - And the LZW scheme | ||
57 | |||
58 | For the novice, a compresser is a program able to read several data (e.g. bytes) | ||
59 | in input and to write several data in output. The data you obtain from the | ||
60 | output (also called compressed data) will - of course - take less space than | ||
61 | the the input data. This is true in most of cases, if the compresser works | ||
62 | and if the type of the data is correct to be compressed with the given scheme. | ||
63 | The codec (coder-decoder) enables you to save space on your hard disk and/or | ||
64 | to save the communication costs because you always store/transmit the compressed | ||
65 | data. You'll use the decompresser as soon as you need to recover your initial | ||
66 | useful data. Note that the compressed data are useless if you have not | ||
67 | the decoder... | ||
68 | |||
69 | You are doubtless asking "How can I reduce the data size without losing some | ||
70 | informations?". It's easy to answer to this question. I'll only take an example. | ||
71 | I'm sure you have heard about the morse. This system established in the 19th | ||
72 | century use a scheme very close to the huffman one. In the morse you encode | ||
73 | the letters to transmit with two kinds of signs. If you encode these two sign | ||
74 | possibilities in one bit, the symbol 'e' is transmitted in a single bit and | ||
75 | the symbols 'y' and 'z' need four bits. Look at the symbols in the text you are | ||
76 | reading, you'll fast understand the compression ratio... | ||
77 | |||
78 | Important: The source codes associated to the algorithms I present are | ||
79 | completely adaptative on what you need to compress. They all use basical | ||
80 | macros on the top of the file. Usually the macros to change are: | ||
81 | |||
82 | - beginning_of_data | ||
83 | - end_of_data | ||
84 | - read_byte | ||
85 | - read_block | ||
86 | - write_byte | ||
87 | - write_block | ||
88 | |||
89 | These allow the programmer to modify only a little part of the header | ||
90 | of the source codes in order to compress as well memory as files. | ||
91 | |||
92 | beginning_of_data(): Macro used to set the program so that the next read_byte() | ||
93 | call will read the first byte to compress. | ||
94 | end_of_data(): Returns a boolean to know whether there is no more bytes to read | ||
95 | from the input stream. Return 0 if there is no more byte to compress, another | ||
96 | non-zero value otherwise. | ||
97 | read_byte(): Returns a byte read from the input stream if available. | ||
98 | write_byte(x): Writes the byte 'x' to the output stream. | ||
99 | read_block(...) and write_block(...): Same use as read_byte and write_byte(x) | ||
100 | but these macros work on blocks of bytes and not only on a single byte. | ||
101 | |||
102 | If you want to compress *from* the memory, before entering in a xxxcoding | ||
103 | procedure ('xxx' is the actual extension to replace with a given codec), you | ||
104 | have to add a pointer set up to the beginning of the zone to compress. Note | ||
105 | that the following pointer 'source_memory_base' is not to add, it is just given | ||
106 | here to specify a name to the address of the memory zone you are going to | ||
107 | encode or decode. That is the same about source_memory_end which can be either | ||
108 | a pointer to create or an existing pointer. | ||
109 | |||
110 | unsigned char *source_memory_base, /* Base of the source memory */ | ||
111 | *source_memory_end, /* Last address to read. | ||
112 | source_memory_end=source_memory_base+source_zone_length-1 */ | ||
113 | *source_ptr; /* Used in the xxxcoding procedure */ | ||
114 | void pre_start() | ||
115 | { source_ptr=source_memory_base; | ||
116 | xxxcoding(); | ||
117 | } | ||
118 | |||
119 | end_of_data() and read_byte() are also to modify to compress *from* memory: | ||
120 | |||
121 | #define end_of_data() (source_ptr>source_memory_end) | ||
122 | #define read_byte() (*(source_ptr++)) | ||
123 | |||
124 | If you want to compress *to* memory, before entering in a xxxcoding procedure | ||
125 | ('xxx' is the actual extension to replace with a given codec), you have to add | ||
126 | a pointer. Note that the pointer 'dest_memory_base' is not to add, it is just | ||
127 | given there to specify the address of the destination memory zone you are | ||
128 | going to encode or decode. | ||
129 | |||
130 | unsigned char *dest_memory_base, /* Base of the destination memory */ | ||
131 | *dest_ptr; /* Used in the xxxcoding procedure */ | ||
132 | void pre_start() | ||
133 | { dest_ptr=dest_memory_base; | ||
134 | xxxcoding(); | ||
135 | } | ||
136 | |||
137 | Of course, you can combine both from and to memory in the pre_start() procedure. | ||
138 | The files dest_file and source_file handled in the main() function are | ||
139 | to remove... | ||
140 | |||
141 | void pre_start() | ||
142 | { source_ptr=source_memory_base; | ||
143 | dest_ptr=dest_memory_base; | ||
144 | xxxcoding(); | ||
145 | } | ||
146 | |||
147 | In fact, to write to memory, the problem is in the write_byte(x) procedure. | ||
148 | This problem exists because your destination zone can either be a static | ||
149 | zone or a dynamically allocated zone. In the two cases, you have to check | ||
150 | if there is no overflow, especially if the coder is not efficient and must | ||
151 | produce more bytes than you reserved in memory. | ||
152 | |||
153 | In the first case, with a *static* zone, write_byte(x) macro should look like | ||
154 | that: | ||
155 | |||
156 | unsigned long int dest_zone_length, | ||
157 | current_size; | ||
158 | |||
159 | #define write_byte(x) { if (current_size==dest_zone_length) \ | ||
160 | exit(1); \ | ||
161 | dest_ptr[current_size++]=(unsigned char)(x); \ | ||
162 | } | ||
163 | |||
164 | In the static version, the pre_start() procedure is to modify as following: | ||
165 | |||
166 | void pre_start() | ||
167 | { source_ptr=source_memory_base; | ||
168 | dest_ptr=dest_memory_base; | ||
169 | dest_zone_length=...; /* Set up to the actual destination zone length */ | ||
170 | current_size=0; /* Number of written bytes */ | ||
171 | xxxcoding(); | ||
172 | } | ||
173 | Otherwise, dest_ptr is a zone created by the malloc instruction and you can try | ||
174 | to resize the allocated zone with the realloc instruction. Note that I increment | ||
175 | the zone one kilo-bytes by one kylo-bytes. You have to add two other variables: | ||
176 | |||
177 | unsigned long int dest_zone_length, | ||
178 | current_size; | ||
179 | |||
180 | #define write_byte(x) { if (current_size==dest_zone_length) \ | ||
181 | { dest_zone_length += 1024; \ | ||
182 | if ((dest_ptr=(unsigned char *)realloc(dest_ptr,dest_zone_length*sizeof(unsigned char)))==NULL) \ | ||
183 | exit(1); /* You can't compress in memory \ | ||
184 | => I exit but *you* can make a routine to swap on disk */ \ | ||
185 | } \ | ||
186 | dest_ptr[current_size++]=(unsigned char)(x); \ | ||
187 | } | ||
188 | |||
189 | With the dynamically allocated version, change the pre_start() routine as following: | ||
190 | |||
191 | void pre_start() | ||
192 | { source_ptr=source_memory_base; | ||
193 | dest_ptr=dest_memory_base; | ||
194 | dest_zone_length=1024; | ||
195 | if ((dest_ptr=(unsigned char *)malloc(dest_zone_length*sizeof(unsigned char)))==NULL) | ||
196 | exit(1); /* You need at least 1 kb in the dynamical memory ! */ | ||
197 | current_size=0; /* Number of written bytes */ | ||
198 | xxxcoding(); | ||
199 | /* Handle the bytes in dest_ptr but don't forget to free these bytes with: | ||
200 | free(dest_ptr); | ||
201 | */ | ||
202 | } | ||
203 | |||
204 | The previously given macros work as: | ||
205 | |||
206 | void demo() /* The file opening, closing and variables | ||
207 | must be set up by the calling procedure */ | ||
208 | { unsigned char byte; | ||
209 | /* And not 'char byte' (!) */ | ||
210 | while (!end_of_data()) | ||
211 | { byte=read_byte(); | ||
212 | printf("Byte read=%c\n",byte); | ||
213 | } | ||
214 | } | ||
215 | |||
216 | You must not change the rest of the program unless you're really sure and | ||
217 | really need to do it! | ||
218 | |||
219 | +==========================================================+ | ||
220 | | The RLE encoding | | ||
221 | +==========================================================+ | ||
222 | |||
223 | RLE is an acronym that stands for Run Length Encoding. You may encounter it | ||
224 | as an other acronym: RLC, Run Length Coding. | ||
225 | |||
226 | The idea in this scheme is to recode your data with regard to the repetition | ||
227 | frames. A frame is one or more bytes that occurr one or several times. | ||
228 | |||
229 | There are several means to encode occurrences. So, you'll have several codecs. | ||
230 | For example, you may have a sequence such as: | ||
231 | 0,0,0,0,0,0,255,255,255,2,3,4,2,3,4,5,8,11 | ||
232 | |||
233 | Some codecs will only deal with the repetitions of '0' and '255' but some other | ||
234 | will deal with the repetitions of '0', '255', and '2,3,4'. | ||
235 | |||
236 | You have to keep in your mind something important based on this example. A codec | ||
237 | won't work on all the data you will try to compress. So, in case of non | ||
238 | existence of sequence repetitions, the codecs based on RLE schemes must not | ||
239 | display a message to say: "Bye bye". Actually, they will try to encode these | ||
240 | non repeted data with a value that says "Sorry, I only make a copy of the inital | ||
241 | input". Of course, a copy of the input data with an header in front of this copy | ||
242 | will make a biggest output data but if you consider the whole data to compress, | ||
243 | the encoding of repeated frames will take less space than the encoding | ||
244 | of non-repeated frames. | ||
245 | |||
246 | All of the algorithms with the name of RLE have the following look with three | ||
247 | or four values: | ||
248 | - Value saying if there's a repetition | ||
249 | - Value saying how many repetitions (or non repetition) | ||
250 | - Value of the length of the frame (useless if you just encode frame | ||
251 | with one byte as maximum length) | ||
252 | - Value of the frame to repeat (or not) | ||
253 | |||
254 | I gave four algorithms to explain what I say. | ||
255 | |||
256 | *** First RLE scheme *** | ||
257 | |||
258 | The first scheme is the simpliest I know, and looks like the one used in MAC | ||
259 | system (MacPackBit) and some image file formats such as Targa, PCX, TIFF, ... | ||
260 | |||
261 | Here, all compressed blocks begin with a byte, named header, which description | ||
262 | is: | ||
263 | |||
264 | Bits 7 6 5 4 3 2 1 0 | ||
265 | Header X X X X X X X X | ||
266 | |||
267 | Bits 7: Compression status (1=Compression applied) | ||
268 | 0 to 6: Number of bytes to handle | ||
269 | |||
270 | So, if the bit 7 is set up to 0, the 0 to 6 bits give the number of bytes | ||
271 | that follow (minus 1, to gain more over compress) and that were not compressed | ||
272 | (native bytes). If the bit 7 is set up to 1, the same 0 to 6 bits give | ||
273 | the number of repetition (minus 2) of the following byte. | ||
274 | |||
275 | As you see, this method only handle frame with one byte. | ||
276 | |||
277 | Additional note: You have 'minus 1' for non-repeated frames because you must | ||
278 | have at least one byte to compress and 'minus 2' for repeated frames because the | ||
279 | repetition must be 2, at least. | ||
280 | |||
281 | Compression scheme: | ||
282 | |||
283 | First byte=Next | ||
284 | /\ | ||
285 | / \ | ||
286 | Count the byte Count the occurrence of NON identical | ||
287 | occurrences bytes (maximum 128 times) | ||
288 | (maximum 129 times) and store them in an array | ||
289 | | | | ||
290 | | | | ||
291 | 1 bit '1' 1 bit '0' | ||
292 | + 7 bits giving + 7 bits giving | ||
293 | the number (-2) the number (-1) | ||
294 | of repetitions of non repetition | ||
295 | + repeated byte + n non repeated bytes | ||
296 | | | | ||
297 | 1xxxxxxx,yyyyyyyy 0xxxxxxx,n bytes | ||
298 | [-----------------] [----------------] | ||
299 | |||
300 | Example: | ||
301 | |||
302 | Sequence of bytes to encode | Coded values | Differences with compression | ||
303 | | | (unit: byte) | ||
304 | ------------------------------------------------------------------------- | ||
305 | 255,15, | 1,255,15, | -1 | ||
306 | 255,255, | 128,255, | 0 | ||
307 | 15,15, | 128,15, | 0 | ||
308 | 255,255,255, | 129,255, | +1 | ||
309 | 15,15,15, | 129,15, | +1 | ||
310 | 255,255,255,255, | 130,255, | +2 | ||
311 | 15,15,15,15 | 130,15 | +2 | ||
312 | |||
313 | See codecs source codes: codrle1.c and dcodrle1.c | ||
314 | |||
315 | *** Second RLE scheme *** | ||
316 | |||
317 | In the second scheme of RLE compression you look for the less frequent byte | ||
318 | in the source to compress and use it as an header for all compressed block. | ||
319 | |||
320 | In the best cases, the occurrence of this byte is zero in the data to compress. | ||
321 | |||
322 | Two possible schemes, firstly with handling frames with only one byte, | ||
323 | secondly with handling frames with one byte *and* more. The first case is | ||
324 | the subject of this current compression scheme, the second is the subject | ||
325 | of next compression scheme. | ||
326 | |||
327 | For the frame of one byte, header byte is written in front of all repetition | ||
328 | with at least 4 bytes. It is then followed by the repetition number minus 1 and | ||
329 | the repeated byte. | ||
330 | Header byte, Occurrence number-1, repeated byte | ||
331 | |||
332 | If a byte don't repeat more than tree times, the three bytes are written without | ||
333 | changes in the destination stream (no header nor length, nor repetition in front | ||
334 | or after theses bytes). | ||
335 | |||
336 | An exception: If the header byte appears in the source one, two, three and up | ||
337 | times, it'll be respectively encoded as following: | ||
338 | - Header byte, 0 | ||
339 | - Header byte, 1 | ||
340 | - Header byte, 2 | ||
341 | - Header byte, Occurrence number-1, Header byte | ||
342 | |||
343 | Example, let's take the previous example. A non frequent byte is zero-ASCII | ||
344 | because it never appears. | ||
345 | |||
346 | Sequence of bytes to encode | Coded values | Differences with compression | ||
347 | | | (unit: byte) | ||
348 | ------------------------------------------------------------------------- | ||
349 | 255,15, | 255,15, | -1 | ||
350 | 255,255, | 255,255, | 0 | ||
351 | 15,15, | 15,15, | 0 | ||
352 | 255,255,255, | 255,255,255, | 0 | ||
353 | 15,15,15, | 15,15,15, | 0 | ||
354 | 255,255,255,255, | 0,3,255, | -1 | ||
355 | 15,15,15,15 | 0,3,15 | -1 | ||
356 | |||
357 | If the header would appear, we would see: | ||
358 | |||
359 | Sequence of bytes to encode | Coded values | Differences with compression | ||
360 | | | (unit: byte) | ||
361 | ------------------------------------------------------------------------- | ||
362 | 0, | 0,0, | +1 | ||
363 | 255, | 255, | 0 | ||
364 | 0,0, | 0,1, | 0 | ||
365 | 15, | 15, | 0 | ||
366 | 0,0,0, | 0,2, | -1 | ||
367 | 255, | 255, | 0 | ||
368 | 0,0,0,0 | 0,3,0 | -1 | ||
369 | |||
370 | See codecs source codes: codrle2.c and dcodrle2.c | ||
371 | |||
372 | *** Third RLE scheme *** | ||
373 | |||
374 | It's the same idea as the second scheme but we can encode frames with | ||
375 | more than one byte. So we have three cases: | ||
376 | |||
377 | - If it was the header byte, whatever is its occurrence, you encode it with: | ||
378 | Header byte,0,number of occurrence-1 | ||
379 | - For frames which (repetition-1)*length>3, encode it as: | ||
380 | Header byte, Number of frame repetition-1, frame length-1,bytes of frame | ||
381 | - If no previous cases were detected, you write them as originally (no header, | ||
382 | nor length, nor repetition in front or after theses bytes). | ||
383 | |||
384 | Example based on the previous examples: | ||
385 | |||
386 | Sequence of bytes to encode | Coded values | Differences with compression | ||
387 | | | (unit: byte) | ||
388 | ----------------------------------------------------------------------------- | ||
389 | 255,15, | 255,15, | 0 | ||
390 | 255,255, | 255,255, | 0 | ||
391 | 15,15, | 15,15, | 0 | ||
392 | 255,255,255, | 255,255,255, | 0 | ||
393 | 15,15,15, | 15,15,15, | 0 | ||
394 | 255,255,255,255, | 255,255,255,255, | 0 | ||
395 | 15,15,15,15, | 15,15,15,15, | 0 | ||
396 | 16,17,18,16,17,18, |16,17,18,16,17,18,| 0 | ||
397 | 255,255,255,255,255, | 0,4,0,255, | -1 | ||
398 | 15,15,15,15,15, | 0,4,0,15, | -1 | ||
399 | 16,17,18,16,17,18,16,17,18,| 0,2,2,16,17,18, | -3 | ||
400 | 16,17,18,19,16,17,18,19 |0,1,3,16,17,18,19 | -1 | ||
401 | |||
402 | If the header (value 0) would be met, we would see: | ||
403 | |||
404 | Sequence of bytes to encode | Coded values | Differences with compression | ||
405 | | | (unit: byte) | ||
406 | -------------------------------------------------------------------------- | ||
407 | 0, | 0,0,0, | +2 | ||
408 | 255, | 255, | 0 | ||
409 | 0,0, | 0,0,1, | +1 | ||
410 | 15, | 15, | 0 | ||
411 | 0,0,0, | 0,0,2, | 0 | ||
412 | 255, | 255, | 0 | ||
413 | 0,0,0,0 | 0,0,3 | -1 | ||
414 | |||
415 | See codecs source codes: codrle3.c and dcodrle3.c | ||
416 | |||
417 | *** Fourth RLE scheme *** | ||
418 | |||
419 | This last RLE algorithm better handles repetitions of any kind (one byte | ||
420 | and more) and non repetitions, including few non repetitions, and does not | ||
421 | read the source by twice as RLE type 3. | ||
422 | |||
423 | Compression scheme is: | ||
424 | |||
425 | First byte=Next byte? | ||
426 | /\ | ||
427 | Yes / \ No | ||
428 | / \ | ||
429 | 1 bit '0' 1 bit '1' | ||
430 | / \ | ||
431 | / \ | ||
432 | Count the Motif of several | ||
433 | occurrences repeated byte? | ||
434 | of 1 repeated ( 65 bytes repeated | ||
435 | byte (maximum 257 times maxi) | ||
436 | 16449 times) /\ | ||
437 | /\ / \ | ||
438 | / \ / \ | ||
439 | / \ / \ | ||
440 | / \ / \ | ||
441 | 1 bit '0' 1 bit '1' 1 bit '0' 1 bit '1' | ||
442 | + 6 bits + 14 bits + 6 bits of | | ||
443 | giving the giving the the length Number of non repetition | ||
444 | length (-2) length (-66) of the motif (maximum 8224) | ||
445 | of the of the + 8 bits of /\ | ||
446 | repeated byte repeated byte the number (-2) < 33 / \ > 32 | ||
447 | + repeated byte + repeated byte of repetition / \ | ||
448 | | | + bytes of the 1 bit '0' 1 bit '1' | ||
449 | | | motif + 5 bits of + 13 bits | ||
450 | | | | the numer (-1) of the | ||
451 | | | | of non number (-33) | ||
452 | | | | repetition of repetition | ||
453 | | | | + non + non | ||
454 | | | | repeated repeated | ||
455 | | | | bytes bytes | ||
456 | | | | | | | ||
457 | | | | | 111xxxxx,xxxxxxxx,n bytes | ||
458 | | | | | [-------------------------] | ||
459 | | | | | | ||
460 | | | | 110xxxxx,n bytes | ||
461 | | | | [----------------] | ||
462 | | | | | ||
463 | | | 10xxxxxx,yyyyyyyy,n bytes | ||
464 | | | [-------------------------] | ||
465 | | | | ||
466 | | 01xxxxxx,xxxxxxxx,1 byte | ||
467 | | [------------------------] | ||
468 | | | ||
469 | 00xxxxxx,1 byte | ||
470 | [---------------] | ||
471 | |||
472 | Example, same as previously: | ||
473 | |||
474 | Sequence of bytes to encode | Coded values | Differences with compression | ||
475 | | | (unit: byte) | ||
476 | -------------------------------------------------------------------------- | ||
477 | 255,15 | 11000001b,255,15, | +1 | ||
478 | 255,255 | 00000000b,255, | 0 | ||
479 | 15,15 | 00000000b,15, | 0 | ||
480 | 255,255,255 | 00000001b,255, | -1 | ||
481 | 15,15,15 | 00000001b,15, | -1 | ||
482 | 255,255,255,255 | 00000010b,255, | -2 | ||
483 | 15,15,15,15 | 00000010b,15, | -2 | ||
484 | 16,17,18,16,17,18 |10000001b,0,16,17,18,| -1 | ||
485 | 255,255,255,255,255 | 00000011b,255, | -3 | ||
486 | 15,15,15,15,15 | 00000011b,15, | -3 | ||
487 | 16,17,18,16,17,18,16,17,18 | 10000001b,16,17,18, | -4 | ||
488 | 16,17,18,19,16,17,18,19 |10000010b,16,17,18,19| -2 | ||
489 | |||
490 | +==========================================================+ | ||
491 | | The Huffman encoding | | ||
492 | +==========================================================+ | ||
493 | |||
494 | This method comes from the searcher who established the algorithm in 1952. | ||
495 | This method allows both a dynamic and static statistic schemes. A statistic | ||
496 | scheme works on the data occurrences. It is not as with RLE where you had | ||
497 | a consideration of the current occurrence of a frame but rather a consideration | ||
498 | of the global occurrences of each data in the input stream. In this last case, | ||
499 | frames can be any kinds of sequences you want. On the other hand, Huffman | ||
500 | static encoding appears in some compressers such as ARJ on PCs. This enforces | ||
501 | the encoder to consider every statistic as the same for all the data you have. | ||
502 | Of course, the results are not as good as if it were a dynamic encoding. | ||
503 | The static encoding is faster than the dynamic encoding but the dynamic encoding | ||
504 | will be adapted to the statistic of the bytes of the input stream and will | ||
505 | of course become more efficient by producing shortest output. | ||
506 | |||
507 | The main idea in Huffman encoding is to re-code every byte with regard to its | ||
508 | occurrence. The more frequent bytes in the data to compress will be encoded with | ||
509 | less than 8 bits and the others could need 8 bits see even more to be encoded. | ||
510 | You immediately see that the codes associated to the different bytes won't have | ||
511 | identical size. The Huffman method will actually require that the binary codes | ||
512 | have not a fixed size. We speak then about variable length codes. | ||
513 | |||
514 | The dynamical Huffman scheme needs the binary trees for the encoding. This | ||
515 | enables you to obtain the best codes, adapted to the source data. | ||
516 | The demonstration won't be given there. To help the neophyt, I will just explain | ||
517 | what is a binary tree. | ||
518 | |||
519 | A binary tree is special fashion to represent the data. A binary tree is | ||
520 | a structure with an associated value with two pointers. The term of binary has | ||
521 | been given because of the presence of two pointers. Because of some conventions, | ||
522 | one of the pointer is called left pointer and the second pointer is called right | ||
523 | pointer. Here is a visual representation of a binary tree. | ||
524 | |||
525 | Value | ||
526 | / \ | ||
527 | / \ | ||
528 | Value Value | ||
529 | / \ / \ | ||
530 | ... ... ... ... | ||
531 | |||
532 | One problem with a binary encoding is a prefix problem. A prefix is the first | ||
533 | part of the representation of a value, e.g. "h" and "he" are prefixes of "hello" | ||
534 | but not "el". To understand the problem, let's code the letters "A", "B", "C", | ||
535 | "D", and "E" respectively as 00b, 01b, 10b, 11b, and 100b. When you read | ||
536 | the binary sequence 00100100b, you are unable to say if this comes from "ACBA" | ||
537 | or "AEE". To avoid such situations, the codes must have a prefix property. | ||
538 | And the letter "E" mustn't begin with the sequence of an other code. With "A", | ||
539 | "B", "C", "D", and "E" respectively affected with 1b, 01b, 001b, 0001b, and | ||
540 | 0000b, the sequence 1001011b will only be decoded as "ACBA". | ||
541 | |||
542 | 1 0 | ||
543 | <- /\ -> | ||
544 | / \ | ||
545 | "A" /\ | ||
546 | "B" \ | ||
547 | /\ | ||
548 | "C" \ | ||
549 | /\ | ||
550 | "D" "E" | ||
551 | |||
552 | As you see, with this tree, an encoding will have the prefix property | ||
553 | if the bytes are at the end of each "branch" and you have no byte at the "node". | ||
554 | You also see that if you try to reach a character by the right pointer you add | ||
555 | a bit set to 0 and by the left pointer, you add a bit set to 1 to the current | ||
556 | code. The previous *bad* encoding provide the following bad tree: | ||
557 | |||
558 | /\ | ||
559 | / \ | ||
560 | / \ | ||
561 | /\ /\ | ||
562 | / \ "B" "A" | ||
563 | / \ | ||
564 | "D" "C"\ | ||
565 | / \ | ||
566 | "E" | ||
567 | |||
568 | You see here that the coder shouldn't put the "C" at a node... | ||
569 | |||
570 | As you see, the largest binary code are those with the longest distance | ||
571 | from the top of the tree. Finally, the more frequent bytes will be the highest | ||
572 | in the tree in order you have the shortest encoding and the less frequent bytes | ||
573 | will be the lowest in the tree. | ||
574 | |||
575 | From an algorithmic point of view, you make a list of each byte you encountered | ||
576 | in the stream to compress. This list will always be sorted. The zero-occurrence | ||
577 | bytes are removed from this list. You take the two bytes with the smallest | ||
578 | occurrences in the list. Whenever two bytes have the same "weight", you take two | ||
579 | of them regardless to their ASCII value. You join them in a node. This node will | ||
580 | have a fictive byte value (256 will be a good one!) and its weight will be | ||
581 | the sum of the two joined bytes. You replace then the two joined bytes with | ||
582 | the fictive byte. And you continue so until you have one byte (fictive or not) | ||
583 | in the list. Of course, this process will produce the shortest codes if the list | ||
584 | remains sorted. I will not explain with arcana hard maths why the result | ||
585 | is a set of the shortest bytes... | ||
586 | |||
587 | Important: I use as convention that the right sub-trees have a weight greater | ||
588 | or equal to the weight of the left sub-trees. | ||
589 | |||
590 | Example: Let's take a file to compress where we notice the following | ||
591 | occurrences: | ||
592 | |||
593 | Listed bytes | Frequences (Weight) | ||
594 | ---------------------------------- | ||
595 | 0 | 338 | ||
596 | 255 | 300 | ||
597 | 31 | 280 | ||
598 | 77 | 24 | ||
599 | 115 | 21 | ||
600 | 83 | 20 | ||
601 | 222 | 5 | ||
602 | |||
603 | We will begin by joining the bytes 83 and 222. This will produce a fictive node | ||
604 | 1 with a weight of 20+5=25. | ||
605 | |||
606 | (Fictive 1,25) | ||
607 | /\ | ||
608 | / \ | ||
609 | (222,5) (83,20) | ||
610 | |||
611 | Listed bytes | Frequences (Weight) | ||
612 | ---------------------------------- | ||
613 | 0 | 338 | ||
614 | 255 | 300 | ||
615 | 31 | 280 | ||
616 | Fictive 1 | 25 | ||
617 | 77 | 24 | ||
618 | 115 | 21 | ||
619 | |||
620 | Note that the list is sorted... The smallest values in the frequences are 21 and | ||
621 | 24. That is why we will take the bytes 77 and 115 to build the fictive node 2. | ||
622 | |||
623 | (Fictive 2,45) | ||
624 | /\ | ||
625 | / \ | ||
626 | (115,21) (77,25) | ||
627 | |||
628 | Listed bytes | Frequences (Weight) | ||
629 | ---------------------------------- | ||
630 | 0 | 338 | ||
631 | 255 | 300 | ||
632 | 31 | 280 | ||
633 | Fictive 2 | 45 | ||
634 | Fictive 1 | 25 | ||
635 | |||
636 | The nodes with smallest weights are the fictive 1 and 2 nodes. These are joined | ||
637 | to build the fictive node 3 whose weight is 40+25=70. | ||
638 | |||
639 | (Fictive 3,70) | ||
640 | / \ | ||
641 | / \ | ||
642 | / \ | ||
643 | /\ / \ | ||
644 | / \ / \ | ||
645 | (222,5) (83,20) (115,21) (77,25) | ||
646 | |||
647 | Listed bytes | Frequences (Weight) | ||
648 | ---------------------------------- | ||
649 | 0 | 338 | ||
650 | 255 | 300 | ||
651 | 31 | 280 | ||
652 | Fictive 3 | 70 | ||
653 | |||
654 | The fictive node 3 is linked to the byte 31. Total weight: 280+70=350. | ||
655 | |||
656 | (Fictive 4,350) | ||
657 | / \ | ||
658 | / \ | ||
659 | / \ | ||
660 | / \ (31,280) | ||
661 | / \ | ||
662 | / \ | ||
663 | /\ / \ | ||
664 | / \ / \ | ||
665 | (222,5) (83,20) (115,21) (77,25) | ||
666 | |||
667 | Listed bytes | Frequences (Weight) | ||
668 | ---------------------------------- | ||
669 | Fictive 4 | 350 | ||
670 | 0 | 338 | ||
671 | 255 | 300 | ||
672 | |||
673 | As you see, being that we sort the list, the fictive node 4 has become the first | ||
674 | of the list. We join the bytes 0 and 255 in a same fictive node, the number 5 | ||
675 | whose weight is 338+300=638. | ||
676 | |||
677 | (Fictive 5,638) | ||
678 | /\ | ||
679 | / \ | ||
680 | (255,300) (0,338) | ||
681 | |||
682 | Listed bytes | Frequences (Weight) | ||
683 | ---------------------------------- | ||
684 | Fictive 5 | 638 | ||
685 | Fictive 4 | 350 | ||
686 | |||
687 | The fictive nodes 4 and 5 are finally joined. Final weight: 638+350=998 bytes. | ||
688 | It is actually the total byte number in the initial file: 338+300+24+21+20+5. | ||
689 | |||
690 | (Tree,998) | ||
691 | 1 / \ 0 | ||
692 | <- / \ -> | ||
693 | / \ | ||
694 | / \ | ||
695 | / \ | ||
696 | / \ / \ | ||
697 | / \ / \ | ||
698 | / \ / \ | ||
699 | / \ (31,280) (255,300) (0,338) | ||
700 | / \ | ||
701 | / \ | ||
702 | /\ / \ | ||
703 | / \ / \ | ||
704 | (222,5) (83,20) (115,21) (77,25) | ||
705 | |||
706 | Bytes | Huffman codes | Frequences | Binary length*Frequence | ||
707 | ------------------------------------------------------------ | ||
708 | 0 | 00b | 338 | 676 | ||
709 | 255 | 01b | 300 | 600 | ||
710 | 31 | 10b | 280 | 560 | ||
711 | 77 | 1101b | 24 | 96 | ||
712 | 115 | 1100b | 21 | 84 | ||
713 | 83 | 1110b | 20 | 80 | ||
714 | 222 | 1111b | 5 | 20 | ||
715 | |||
716 | Results: Original file size: (338+300+280+24+21+20+5)*8=7904 bits (=998 bytes) | ||
717 | versus 676+600+560+96+84+80+20=2116 bits, i.e. 2116/8=265 bytes. | ||
718 | |||
719 | Now you know how to code an input stream. The last problem is to decode all this | ||
720 | stuff. Actually, when you meet a binary sequence you can't say whether it comes | ||
721 | from such byte list or such other one. Furthermore, if you change the occurrence | ||
722 | of one or two bytes, you won't obtain the same resulting binary tree. Try for | ||
723 | example to encode the previous list but with the following occurrences: | ||
724 | |||
725 | Listed bytes | Frequences (Weight) | ||
726 | ---------------------------------- | ||
727 | 255 | 418 | ||
728 | 0 | 300 | ||
729 | 31 | 100 | ||
730 | 77 | 24 | ||
731 | 115 | 21 | ||
732 | 83 | 20 | ||
733 | 222 | 5 | ||
734 | |||
735 | As you can observe it, the resulting binary tree is quite different, we had yet | ||
736 | the same initial bytes. To not be in such a situation we will put an header | ||
737 | in front of all data. I can't comment longly this header but I can say | ||
738 | I minimize it as much as I could. The header is divided into two parts. | ||
739 | The first part of this header looks closely to a boolean table (coded more or | ||
740 | less in binary to save space) and the second part provide to the decoder | ||
741 | the binary code associated to each byte encountered in the original input | ||
742 | stream. | ||
743 | |||
744 | Here is a summary of the header: | ||
745 | |||
746 | First part | ||
747 | ---------- | ||
748 | First bit | ||
749 | / \ | ||
750 | 1 / \ 0 | ||
751 | / \ | ||
752 | 256 bits set to 0 or 1 5 bits for the number n (minus 1) | ||
753 | depending whether the of bytes encountered | ||
754 | corresponding byte was in the file to compres | ||
755 | in the file to compress | | ||
756 | (=> n bits set to 1, \ / | ||
757 | n>32) n values of 8-bits (n<=32) | ||
758 | \ / | ||
759 | \ / | ||
760 | \ / | ||
761 | Second part | | ||
762 | ----------- | | ||
763 | | | ||
764 | +------------->| | ||
765 | (n+1) times | | | ||
766 | (n bytes of | First bit? | ||
767 | the values | / \ | ||
768 | encountered | 1 / \ 0 | ||
769 | in the | / \ | ||
770 | source file | 8 bits of 5 bits of the | ||
771 | + the code | the length length (-1) | ||
772 | of a | (-1) of the of the following | ||
773 | fictive | following binary | ||
774 | byte | binary code code | ||
775 | to stop the | (length>32) (length<=32) | ||
776 | decoding. | \ / | ||
777 | The fictive | \ / | ||
778 | is set to | \ / | ||
779 | 256 in the | | | ||
780 | Huffman | binary code | ||
781 | -tree of | | | ||
782 | encoding) +--------------| | ||
783 | | | ||
784 | Binary encoding of the source file | ||
785 | | | ||
786 | Code of end of encoding | ||
787 | | | ||
788 | |||
789 | |||
790 | With my codecs I can handle binary sequences with a length of 256 bits. | ||
791 | This correspond to encode all the input stream from one byte to infinite length. | ||
792 | In fact if a byte had a range from 0 to 257 instead of 0 to 255, I would have a | ||
793 | bug with my codecs with an input stream of at least 370,959,230,771,131,880,927, | ||
794 | 453,318,055,001,997,489,772,178,180,790,105 bytes !!! | ||
795 | |||
796 | Where come this explosive number? In fact, to have a severe bug, I must have | ||
797 | a completely unbalanced tree: | ||
798 | |||
799 | Tree | ||
800 | /\ | ||
801 | \ | ||
802 | /\ | ||
803 | \ | ||
804 | /\ | ||
805 | \ | ||
806 | ... | ||
807 | /\ | ||
808 | \ | ||
809 | /\ | ||
810 | |||
811 | Let's take the following example: | ||
812 | |||
813 | Listed bytes | Frequences (Weight) | ||
814 | ---------------------------------- | ||
815 | 32 | 5 | ||
816 | 101 | 3 | ||
817 | 97 | 2 | ||
818 | 100 | 1 | ||
819 | 115 | 1 | ||
820 | |||
821 | This produces the following unbalanced tree: | ||
822 | |||
823 | Tree | ||
824 | /\ | ||
825 | (32,5) \ | ||
826 | /\ | ||
827 | (101,3) \ | ||
828 | /\ | ||
829 | (97,2) \ | ||
830 | /\ | ||
831 | (115,1) (100,1) | ||
832 | |||
833 | Let's speak about a mathematical series: The Fibonacci series. It is defined as | ||
834 | following: | ||
835 | |||
836 | { Fib(0)=0 | ||
837 | { Fib(1)=1 | ||
838 | { Fib(n)=Fib(n-2)+Fib(n-1) | ||
839 | |||
840 | Fib(0)=0, Fib(1)=1, Fib(2)=1, Fib(3)=2, Fib(4)=3, Fib(5)=5, Fib(6)=8, Fib(7)=13, | ||
841 | etc. | ||
842 | |||
843 | But 1, 1, 2, 3, 5, 8 are the occurrences of our list! We can actually | ||
844 | demonstrate that to have an unbalanced tree, we have to take a list with | ||
845 | an occurrence based on the Fibonacci series (these values are minimal). | ||
846 | If the data to compress have m different bytes, when the tree is unbalanced, | ||
847 | the longest code need m-1 bits. In our little previous example where m=5, | ||
848 | the longest codes are associated to the bytes 100 and 115, respectively coded | ||
849 | 0001b and 0000b. We can also say that to have an unbalanced tree we must have | ||
850 | at least 5+3+2+1+1=12=Fib(7)-1. To conclude about all that, with a coder that | ||
851 | uses m-1 bits, you must never have an input stream size over than Fib(m+2)-1, | ||
852 | otherwise, there could be a bug in the output stream. Of course, with my codecs | ||
853 | there will never be a bug because I can deal with binary code sizes of 1 to 256 | ||
854 | bits. Some encoder could use that with m=31, Fib(31+2)-1=3,524,577 and m=32, | ||
855 | Fib(32+2)-1=5,702,886. And an encoder that uses unisgned integer of 32 bits | ||
856 | shouldn't have a bug until about 4 Gb... | ||
857 | |||
858 | +==========================================================+ | ||
859 | | The LZW encoding | | ||
860 | +==========================================================+ | ||
861 | |||
862 | The LZW scheme is due to three searchers, i.e. Abraham Lempel and Jacob Ziv | ||
863 | worked on it in 1977, and Terry Welch achieved this scheme in 1984. | ||
864 | |||
865 | LZW is patented in USA. This patent, number 4,558,302, is covered by Unisys | ||
866 | Corporation. You can usually write (without fees) software codecs which use | ||
867 | the LZW scheme but hardware companies can't do so. You may get a limited | ||
868 | licence by writting to: | ||
869 | Welch Licencing Department | ||
870 | Office of the General Counsel | ||
871 | M/S C1SW19 | ||
872 | Unisys corporation | ||
873 | Blue Bell | ||
874 | Pennsylvania, 19424 (USA) | ||
875 | |||
876 | If you're occidental, you are surely using an LZW encoding every time you are | ||
877 | speaking, especially when you use a dictionary. Let's consider, for example, | ||
878 | the word "Cirrus". As you read a dictionary, you begin with "A", "Aa", and so | ||
879 | on. But a computer has no experience and it must suppose that some words | ||
880 | already exist. That is why with "Cirrus", it supposes that "C", "Ci", "Cir", | ||
881 | "Cirr", "Cirru", and "Cirrus" exist. Of course, being that this is a computer, | ||
882 | all these words are encoded as index numbers. Every time you go forward, you add | ||
883 | a new number associated to the new word. Being that a computer is byte-based | ||
884 | and not alphabetic-based, you have an initial dictionary of 256 letters instead | ||
885 | of our 26 ('A' to 'Z') letters. | ||
886 | |||
887 | Example: Let's code "XYXYZ". First step, "X" is recognized in the initial | ||
888 | dictionary of 256 letters as the 89th. Second step, "Y" is read. Does "XY" | ||
889 | exist? No, then "XY" is stored as the word 256. You write in the output stream | ||
890 | the ASCII of "X", i.e. 88. Now "YX" is tested as not referenced in the current | ||
891 | dictionary. It is stored as the word 257. You write now in the output stream 89 | ||
892 | (ASCII of "Y"). "XY" is now met. But now "XY" is known as the reference 256. | ||
893 | Being that "XY" exists, you test the sequence with one more letter, i.e. "XYZ". | ||
894 | This last word is not referenced in the current dictionary. You write then the | ||
895 | value 256. Finally, you reach the last letter ("Z"). You add "YZ" as the | ||
896 | reference 258 but it is the last letter. That is why you just write the value | ||
897 | 90 (ASCII of "Z"). | ||
898 | |||
899 | Another encoding sample with the string "ABADABCCCABCEABCECCA". | ||
900 | |||
901 | +----+-----+---------------+------+----------+-------------------------+------+ | ||
902 | |Step|Input|Dictionary test|Prefix|New symbol|Dictionary |Output| | ||
903 | | | | | | |D0=ASCII with 256 letters| | | ||
904 | +----+-----+---------------+------+----------+-------------------------+------+ | ||
905 | | 1 | "A" |"A" in D0 | "A" | "B" | D1=D0 | 65 | | ||
906 | | | "B" |"AB" not in D0 | | | and "AB"=256 | | | ||
907 | +----+-----+---------------+------+----------+-------------------------+------+ | ||
908 | | 2 | "A" |"B" in D1 | "B" | "A" | D2=D1 | 66 | | ||
909 | | | |"BA" not in D1 | | | and "BA"=257 | | | ||
910 | +----+-----+---------------+------+----------+-------------------------+------+ | ||
911 | | 3 | "D" |"A" in D2 | "A" | "D" | D3=D2 | 65 | | ||
912 | | | |"AD" not in D2 | | | and "AD"=258 | | | ||
913 | +----+-----+---------------+------+----------+-------------------------+------+ | ||
914 | | 4 | "A" |"D" in D3 | "D" | "A" | D4=D3 | 68 | | ||
915 | | | |"DA" not in D3 | | | and "DA"=259 | | | ||
916 | +----+-----+---------------+------+----------+-------------------------+------+ | ||
917 | | 5 | "B" |"A" in D4 | "AB" | "C" | D5=D4 | 256 | | ||
918 | | | "C" |"AB" in D4 | | | and "ABC"=260 | | | ||
919 | | | |"ABC" not in D4| | | | | | ||
920 | +----+-----+---------------+------+----------+-------------------------+------+ | ||
921 | | 6 | "C" |"C" in D5 | "C" | "C" | D6=D5 | 67 | | ||
922 | | | |"CC" not in D5 | | | and "CC"=261 | | | ||
923 | +----+-----+---------------+------+----------+-------------------------+------+ | ||
924 | | 7 | "C" |"C" in D6 | "CC" | "A" | D7=D6 | 261 | | ||
925 | | | "A" |"CC" in D6 | | | and "CCA"=262 | | | ||
926 | | | |"CCA" not in D6| | | | | | ||
927 | +----+-----+---------------+------+----------+-------------------------+------+ | ||
928 | | 8 | "B" |"A" in D7 | "ABC"| "E" | D8=D7 | 260 | | ||
929 | | | "C" |"AB" in D7 | | | and "ABCE"=263 | | | ||
930 | | | "E" |"ABC" in D7 | | | | | | ||
931 | | | <"ABCE" not in D7| | | | | | ||
932 | +----+-----+---------------+------+----------+-------------------------+------+ | ||
933 | | 9 | "A" |"E" in D8 | "E" | "A" | D9=D8 | 69 | | ||
934 | | | |"EA" not in D8 | | | and "EA"=264 | | | ||
935 | +----+-----+---------------+------+----------+-------------------------+------+ | ||
936 | | 10 | "B" |"A" in D9 |"ABCE"| "C" | D10=D9 | 263 | | ||
937 | | | "C" |"AB" in D9 | | | and "ABCEC"=265 | | | ||
938 | | | "E" |"ABC" in D9 | | | | | | ||
939 | | | "C" |"ABCE" in D9 | | | | | | ||
940 | | | <"ABCEC" not in D9> | | | | | ||
941 | +----+-----+---------------+------+----------+-------------------------+------+ | ||
942 | | 11 | "C" |"C" in D10 | "CCA"| | | 262 | | ||
943 | | | "A" |"CC" in D10 | | | | | | ||
944 | | | <"CCA" not in D10| | | | | | ||
945 | +----+-----+---------------+------+----------+-------------------------+------+ | ||
946 | |||
947 | You will notice a problem with the above output: How to write a code of 256 | ||
948 | (for example) on 8 bits? It's simple to solve this problem. You just say that | ||
949 | the encoding starts with 9 bits and as you reach the 512th word, you use a | ||
950 | 10-bits encoding. With 1024 words, you use 11 bits; with 2048 words, 12 bits; | ||
951 | and so on with all numbers of 2^n (n is positive). To better synchronize | ||
952 | the coder and the decoder with all that, most of implementations use two | ||
953 | additional references. The word 256 is a code of reinitialisation (the codec | ||
954 | must reinitialize completely the current dictionary to its 256 initial letters) | ||
955 | and the word 257 is a code of end of information (no more data to read). | ||
956 | Of course, you start your first new word as the code number 258. | ||
957 | |||
958 | You can also do so as in the GIF file format and start with an initial | ||
959 | dictionary of 18 words to code an input stream with only letters coded on 4 bits | ||
960 | (you start with codes of 5 bits in the output stream!). The 18 initial words | ||
961 | are: 0 to 15 (initial letters), 16 (reinit the dictionary), and 17 (end of | ||
962 | information). First new word has code 18, second word, code 19, ... | ||
963 | |||
964 | Important: You can consider that your dictionary is limited to 4096 different | ||
965 | words (as in GIF and TIFF file formats). But if your dictionary is full, you | ||
966 | can decide to send old codes *without* reinitializing the dictionary. All the | ||
967 | decoders must be compliant with this. This enables you to consider that it is | ||
968 | not efficient to reinitialize the full dictionary. Instead of this, you don't | ||
969 | change the dictionary and you send/receive (depending if it's a coder or a | ||
970 | decoder) existing codes in the full dictionary. | ||
971 | |||
972 | My codecs are able to deal as well with most of initial size of data in the | ||
973 | initial dictionary as with full dictionary. | ||
974 | |||
975 | Let's see how to decode an LZW encoding. We saw with true dynamical Huffman | ||
976 | scheme that you needed an header in the encoding codes. Any header is useless | ||
977 | in LZW scheme. When two successive bytes are read, the first must exist in the | ||
978 | dictionary. This code can be immediately decoded and written in the output | ||
979 | stream. If the second code is equal or less than the word number in the current | ||
980 | dictionary, this code is decoded as the first one. At the opposite, if the | ||
981 | second code is equal to the word number in dictionary plus one, this means you | ||
982 | have to write a word composed with the word (the sentence, not the code number) | ||
983 | of the last code plus the first character of the last code. In between, you make | ||
984 | appear a new word. This new word is the one you just sent to the output stream, | ||
985 | it means composed by all the letters of the word associated to the first code | ||
986 | and the first letter of the word of the second code. You continue the processing | ||
987 | with the second and third codes read in the input stream (of codes)... | ||
988 | |||
989 | Example: Let's decode the previous encoding given a bit more above. | ||
990 | |||
991 | +------+-------+----------------+----------+------------------+--------+ | ||
992 | | Step | Input | Code to decode | New code | Dictionary | Output | | ||
993 | +------+-------+----------------+----------+------------------+--------+ | ||
994 | | 1 | 65 | 65 | 66 | 65,66=256 | "A" | | ||
995 | | | 66 | | | | | | ||
996 | +------+-------+----------------+----------+------------------+--------+ | ||
997 | | 2 | 65 | 66 | 65 | 66,65=257 | "B" | | ||
998 | +------+-------+----------------+----------+------------------+--------+ | ||
999 | | 3 | 68 | 65 | 68 | 65,68=258 | "A" | | ||
1000 | +------+-------+----------------+----------+------------------+--------+ | ||
1001 | | 4 | 256 | 68 | 256 | 68,65=259 | "D" | | ||
1002 | +------+-------+----------------+----------+------------------+--------+ | ||
1003 | | 5 | 67 | 256 | 67 | 65,66,67=260 | "AB" | | ||
1004 | +------+-------+----------------+----------+------------------+--------+ | ||
1005 | | 6 | 261 | 67 | 261 | 67,67=261 | "C" | | ||
1006 | +------+-------+----------------+----------+------------------+--------+ | ||
1007 | | 7 | 260 | 261 | 260 | 67,67,65=262 | "CC" | | ||
1008 | +------+-------+----------------+----------+------------------+--------+ | ||
1009 | | 8 | 69 | 260 | 69 | 65,66,67,69=263 | "ABC" | | ||
1010 | +------+-------+----------------+----------+------------------+--------+ | ||
1011 | | 9 | 263 | 69 | 263 | 69,65=264 | "E" | | ||
1012 | +------+-------+----------------+----------+------------------+--------+ | ||
1013 | | 10 | 262 | 263 | 262 |65,66,67,69,67=256| "ABCE" | | ||
1014 | +------+-------+----------------+----------+------------------+--------+ | ||
1015 | | 11 | | 262 | | | "CCA" | | ||
1016 | +------+-------+----------------+----------+------------------+--------+ | ||
1017 | |||
1018 | Summary: The step 4 is an explicit example. The code to decode is 68 ("D" in | ||
1019 | ASCII) and the new code is 256. The new word to add to the dictionary is the | ||
1020 | letters of the first word plus the the first letter of the second code (code | ||
1021 | 256), i.e. 65 ("A" in ASCII) plus 68 ("D"). So the new word has the letters 68 | ||
1022 | and 65 ("AD"). | ||
1023 | |||
1024 | The step 6 is quite special. The first code to decode is referenced but the | ||
1025 | second new code is not referenced being that the dictionary is limited to 260 | ||
1026 | referenced words. We have to make it as the second previously given case, it | ||
1027 | means you must take the word to decode plus its first letter, i.e. "C"+"C"="CC". | ||
1028 | Be care, if any encountered code is *upper* than the dictionary size plus 1, it | ||
1029 | means you have a problem in your data and/or your codecs are...bad! | ||
1030 | |||
1031 | Tricks to improve LZW encoding (but it becomes a non-standard encoding): | ||
1032 | - To limit the dictionary to an high amount of words (4096 words maximum enable | ||
1033 | you to encode a stream of a maximmum 7,370,880 letters with the same dictionary) | ||
1034 | - To use a dictionary of less than 258 if possible (example, with 16 color | ||
1035 | pictures, you start with a dictionary of 18 words) | ||
1036 | - To not reinitialize a dictionary when it is full | ||
1037 | - To reinitialize a dictionary with the most frequent of the previous dictionary | ||
1038 | - To use the codes from (current dictionary size+1) to (maximum dictionary size) | ||
1039 | because these codes are not used in the standard LZW scheme. | ||
1040 | Such a compression scheme has been used (successfully) by Robin Watts | ||
1041 | <ct93008@ox.ac.uk>. | ||
1042 | |||
1043 | +==========================================================+ | ||
1044 | | Summary | | ||
1045 | +==========================================================+ | ||
1046 | |||
1047 | ------------------------------------------------- | ||
1048 | RLE type 1: | ||
1049 | Fastest compression. Good ratio for general purpose. | ||
1050 | Doesn't need to read the data by twice. | ||
1051 | Decoding fast. | ||
1052 | ------------------------------------------------- | ||
1053 | RLE type 2: | ||
1054 | Fast compression. Very good ratio in general (even for general purposes). | ||
1055 | Need to read the data by twice. | ||
1056 | Decoding fast. | ||
1057 | ------------------------------------------------- | ||
1058 | RLE type 3: | ||
1059 | Slowest compression. Good ratio on image file,quite middle for general purposes. | ||
1060 | Need to read the data by twice. | ||
1061 | Change line: | ||
1062 | #define MAX_RASTER_SIZE 256 | ||
1063 | into: | ||
1064 | #define MAX_RASTER_SIZE 16 | ||
1065 | to speed up the encoding (but the result decreases in ratio). If you compress | ||
1066 | with memory buffers, do not modify this line... | ||
1067 | Decoding fast. | ||
1068 | ------------------------------------------------- | ||
1069 | RLE type 4: | ||
1070 | Slow compression. Good ratio on image file, middle in general purposes. | ||
1071 | Change line: | ||
1072 | #define MAX_RASTER_SIZE 66 | ||
1073 | into: | ||
1074 | #define MAX_RASTER_SIZE 16 | ||
1075 | to speed up the encoding (but the result decreases in ratio). If you compress | ||
1076 | with memory buffers, do not modify this line... | ||
1077 | Decoding fast. | ||
1078 | ------------------------------------------------- | ||
1079 | Huffman: | ||
1080 | Fast compression. Good ratio on text files and similar, middle for general | ||
1081 | purposes. Interesting method to use to compress a buffer already compressed by | ||
1082 | RLE types 1 or 2 methods... | ||
1083 | Decoding fast. | ||
1084 | ------------------------------------------------- | ||
1085 | LZW: | ||
1086 | Quite fast compression. Good, see even very good ratio, for general purposes. | ||
1087 | Bigger the data are, better the compression ratio is. | ||
1088 | Decoding quite fast. | ||
1089 | ------------------------------------------------- | ||
1090 | |||
1091 | The source codes work on all kinds of computers with a C compiler. | ||
1092 | With the compiler, optimize the speed run option instead of space option. | ||
1093 | With UNIX system, it's better to compile them with option -O. | ||
1094 | If you don't use a GNU compiler, the source file MUST NOT have a size | ||
1095 | over 4 Gb for RLE 2, 3, and Huffman, because I count the number | ||
1096 | of occurrences of the bytes. | ||
1097 | So, with GNU compilers, 'unsigned lont int' is 8 bytes instead of 4 bytes | ||
1098 | (as normal C UNIX compilers and PCs' compilers, such as Microsoft C++ | ||
1099 | and Borland C++). | ||
1100 | Actually: | ||
1101 | * Normal UNIX compilers, => 4 Gb (unsigned long int = 4 bytes) | ||
1102 | Microsoft C++ and Borland C++ for PCs | ||
1103 | * GNU UNIX compilers => 17179869184 Gb (unsigned long int = 8 bytes) | ||
1104 | |||
1105 | +==========================================================+ | ||
1106 | | END | | ||
1107 | +==========================================================+ | ||
diff --git a/baseline/source/huff_dec/huff_dec.c b/baseline/source/huff_dec/huff_dec.c new file mode 100644 index 0000000..48bdf4b --- /dev/null +++ b/baseline/source/huff_dec/huff_dec.c | |||
@@ -0,0 +1,393 @@ | |||
1 | /* | ||
2 | |||
3 | This program is part of the TACLeBench benchmark suite. | ||
4 | Version V 2.0 | ||
5 | |||
6 | Name: huff_dec | ||
7 | |||
8 | Author: David Bourgin (David.Bourgin@ufrima.imag.fr) | ||
9 | |||
10 | Function: Example of Huffman decoding | ||
11 | |||
12 | Source: ftp://turing.imag.fr/pub/compression/ (1994-09-26) | ||
13 | |||
14 | Original name: dcodhuff.c | ||
15 | |||
16 | Changes: I/O to char arrays instead of file i/o. | ||
17 | Dynamic memory allocation replaced by array. | ||
18 | |||
19 | License: | ||
20 | |||
21 | The source code files (codrl1.c, dcodrl1.c, codrle2.c, dcodrle2.c, codrle3.c, | ||
22 | dcodrle3.c, codrle4.c, dcodrle4.c, codhuff.c, dcodhuff.c) are copyrighted. | ||
23 | They have been uploaded on ftp in turing.imag.fr (129.88.31.7):/pub/compression | ||
24 | on 22/5/94 and have been modified on 22/9/94. | ||
25 | (c) David Bourgin - 1994 | ||
26 | The source codes I provide have no buggs (!) but being that I make them | ||
27 | available for free I have some notes to make. They can change at any time | ||
28 | without notice. I assume no responsability or liability for any errors or | ||
29 | inaccurracies, make no warranty of any kind (express, implied or statutory) | ||
30 | with respect to this publication and expressly disclaim any and all warranties | ||
31 | of merchantability, fitness for particular purposes. Of course, if you have | ||
32 | some problems to use the information presented here, I will try to help you if | ||
33 | I can. | ||
34 | |||
35 | If you include the source codes in your application, here are the conditions: | ||
36 | - You have to put my name in the header of your source file (not in the | ||
37 | excutable program if you don't want) (this item is a must) | ||
38 | - I would like to see your resulting application, if possible (this item is not | ||
39 | a must, because some applications must remain secret) | ||
40 | - Whenever you gain money with your application, I would like to receive a very | ||
41 | little part in order to be encouraged to update my source codes and to develop | ||
42 | new schemes (this item is not a must) | ||
43 | |||
44 | */ | ||
45 | |||
46 | |||
47 | /* | ||
48 | Declaration of types | ||
49 | */ | ||
50 | |||
51 | |||
52 | #include "../extra.h" | ||
53 | typedef struct s_tree { | ||
54 | unsigned int byte; /* A byte has to be coded as an unsigned integer to | ||
55 | allow a node to have a value over 255 */ | ||
56 | struct s_tree *left_ptr; | ||
57 | struct s_tree *right_ptr; | ||
58 | } huff_dec_t_tree; | ||
59 | |||
60 | typedef struct { | ||
61 | unsigned char bits[32]; | ||
62 | unsigned int bits_nb; | ||
63 | unsigned char presence; | ||
64 | } t_bin_val; | ||
65 | |||
66 | |||
67 | /* | ||
68 | Forward declaration of functions | ||
69 | */ | ||
70 | |||
71 | void huff_dec_init( void ); | ||
72 | int huff_dec_return( void ); | ||
73 | int huff_dec_end_of_data(); | ||
74 | int huff_dec_read_byte(); | ||
75 | void huff_dec_write_byte( char ch ); | ||
76 | unsigned char huff_dec_read_code_1_bit(); | ||
77 | unsigned int huff_dec_read_code_n_bits( unsigned int n ); | ||
78 | void huff_dec_read_header( t_bin_val codes_table[257] ); | ||
79 | huff_dec_t_tree *huff_dec_tree_encoding( t_bin_val codes_table[257], | ||
80 | huff_dec_t_tree heap[514] ); | ||
81 | void huff_dec_main( void ); | ||
82 | //int main( void ); | ||
83 | |||
84 | |||
85 | /* | ||
86 | Declaration of global variables | ||
87 | */ | ||
88 | |||
89 | static int huff_dec_input_pos; | ||
90 | static int huff_dec_output_pos; | ||
91 | static char huff_dec_output[1024]; | ||
92 | unsigned char huff_dec_byte_nb_to_read = 0; | ||
93 | unsigned int huff_dec_val_to_read = 0; | ||
94 | |||
95 | |||
96 | /* | ||
97 | Initialization- and return-value-related functions | ||
98 | */ | ||
99 | |||
100 | #define huff_dec_plaintext_len 600 | ||
101 | static const char *huff_dec_plaintext = | ||
102 | "You are doubtless asking \"How can I reduce the data size without losing " | ||
103 | "some informations?\". It's easy to answer to this question. I'll only take " | ||
104 | "an example. I'm sure you have heard about the morse. This system established " | ||
105 | "in the 19th century use a scheme very close to the huffman one. In the morse " | ||
106 | "you encode the letters to transmit with two kinds of signs. If you encode " | ||
107 | "these two sign possibilities in one bit, the symbol 'e' is transmitted in a " | ||
108 | "single bit and the symbols 'y' and 'z' need four bits. Look at the symbols " | ||
109 | "in the text you are reading, you'll fast understand the compression ratio..."; | ||
110 | |||
111 | #define huff_dec_encoded_len 419 | ||
112 | static unsigned char huff_dec_encoded[huff_dec_encoded_len] = { | ||
113 | 128, 0, 0, 0, 80, 133, 32, 32, 128, 100, 4, 32, 63, 239, 255, 240, | ||
114 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | ||
115 | 4, 7, 167, 21, 129, 232, 69, 120, 132, 217, 20, 162, 19, 164, 39, 133, | ||
116 | 252, 138, 105, 20, 194, 19, 129, 240, 172, 138, 248, 150, 11, 11, 240, 201, | ||
117 | 68, 64, 114, 53, 17, 42, 37, 195, 128, 212, 116, 194, 41, 98, 52, 51, | ||
118 | 12, 132, 112, 244, 3, 36, 33, 52, 39, 135, 164, 33, 62, 156, 87, 14, | ||
119 | 110, 22, 87, 50, 85, 198, 99, 142, 140, 194, 81, 78, 158, 84, 129, 254, | ||
120 | 129, 248, 110, 179, 159, 192, 145, 133, 184, 184, 28, 210, 96, 146, 73, 10, | ||
121 | 226, 21, 83, 152, 74, 13, 111, 132, 199, 202, 219, 241, 74, 193, 167, 105, | ||
122 | 222, 31, 147, 6, 55, 31, 129, 40, 232, 52, 153, 160, 148, 18, 36, 197, | ||
123 | 45, 216, 202, 86, 30, 31, 177, 90, 133, 138, 248, 23, 81, 195, 160, 100, | ||
124 | 215, 93, 50, 185, 225, 251, 23, 6, 230, 225, 229, 112, 71, 80, 96, 141, | ||
125 | 205, 176, 230, 85, 196, 9, 24, 93, 90, 121, 225, 76, 68, 152, 63, 25, | ||
126 | 107, 140, 101, 204, 214, 77, 26, 194, 96, 18, 48, 77, 210, 137, 1, 253, | ||
127 | 4, 230, 248, 56, 240, 224, 111, 163, 95, 10, 12, 223, 7, 234, 167, 129, | ||
128 | 40, 36, 96, 135, 125, 245, 250, 2, 198, 120, 127, 0, 145, 133, 213, 167, | ||
129 | 135, 149, 195, 67, 235, 108, 9, 24, 87, 17, 102, 152, 37, 4, 222, 131, | ||
130 | 188, 144, 73, 36, 128, 73, 20, 81, 152, 177, 133, 248, 28, 165, 131, 120, | ||
131 | 127, 240, 242, 184, 104, 125, 109, 129, 35, 30, 4, 145, 65, 202, 88, 9, | ||
132 | 138, 103, 44, 205, 100, 167, 24, 152, 11, 24, 51, 37, 66, 9, 24, 31, | ||
133 | 174, 202, 212, 49, 152, 18, 96, 155, 208, 119, 146, 45, 97, 48, 56, 28, | ||
134 | 194, 90, 224, 204, 144, 232, 176, 36, 96, 126, 187, 43, 83, 12, 121, 129, | ||
135 | 209, 96, 197, 35, 2, 54, 176, 249, 92, 208, 204, 145, 188, 41, 170, 180, | ||
136 | 71, 16, 36, 96, 126, 187, 43, 83, 19, 0, 145, 129, 100, 209, 15, 43, | ||
137 | 135, 55, 6, 238, 180, 194, 90, 17, 229, 115, 21, 168, 251, 140, 131, 162, | ||
138 | 217, 166, 93, 22, 4, 140, 31, 91, 166, 55, 25, 202, 192, 111, 20, 171, | ||
139 | 207, 39, 192, | ||
140 | }; | ||
141 | |||
142 | |||
143 | void huff_dec_init( void ) | ||
144 | { | ||
145 | huff_dec_input_pos = 0; | ||
146 | huff_dec_output_pos = 0; | ||
147 | } | ||
148 | |||
149 | |||
150 | int huff_dec_return( void ) | ||
151 | { | ||
152 | int i; | ||
153 | _Pragma( "loopbound min 1 max 600" ) | ||
154 | for ( i = 0; i < huff_dec_plaintext_len; i++ ) { | ||
155 | if ( huff_dec_plaintext[i] != huff_dec_output[i] ) return i + 1; | ||
156 | } | ||
157 | return 0; | ||
158 | } | ||
159 | |||
160 | |||
161 | /* | ||
162 | Input / output functions | ||
163 | */ | ||
164 | |||
165 | int huff_dec_end_of_data() | ||
166 | { | ||
167 | return huff_dec_input_pos >= huff_dec_encoded_len; | ||
168 | } | ||
169 | |||
170 | |||
171 | int huff_dec_read_byte() | ||
172 | { | ||
173 | return huff_dec_encoded[huff_dec_input_pos++]; | ||
174 | } | ||
175 | |||
176 | |||
177 | void huff_dec_write_byte( char ch ) | ||
178 | { | ||
179 | huff_dec_output[huff_dec_output_pos++] = ch; | ||
180 | } | ||
181 | |||
182 | |||
183 | unsigned char huff_dec_read_code_1_bit() | ||
184 | /* Returned parameters: Returns an unsigned integer with the 0-bit (on the | ||
185 | right of the integer) valid | ||
186 | Action: Reads the next bit in the stream of data to compress | ||
187 | Errors: An input/output error could disturb the running of the program | ||
188 | The source must have enough bits to read | ||
189 | */ | ||
190 | { | ||
191 | if ( huff_dec_byte_nb_to_read ) { | ||
192 | huff_dec_byte_nb_to_read--; | ||
193 | return ( ( huff_dec_val_to_read >> huff_dec_byte_nb_to_read ) & 1 ); | ||
194 | } else { | ||
195 | huff_dec_val_to_read = huff_dec_read_byte(); | ||
196 | huff_dec_byte_nb_to_read = 7; | ||
197 | return ( ( huff_dec_val_to_read >> 7 ) & 1 ); | ||
198 | } | ||
199 | } | ||
200 | |||
201 | |||
202 | unsigned int huff_dec_read_code_n_bits( unsigned int n ) | ||
203 | /* Returned parameters: Returns an unsigned integer with the n-bits (on the | ||
204 | right of the integer) valid | ||
205 | Action: Reads the next n bits in the stream of data to compress | ||
206 | Errors: An input/output error could disturb the running of the program | ||
207 | The source must have enough bits to read | ||
208 | */ | ||
209 | { | ||
210 | unsigned int result = 0; | ||
211 | unsigned i = n; | ||
212 | |||
213 | _Pragma ( "loopbound min 1 max 1" ) | ||
214 | while ( i ) { | ||
215 | _Pragma ( "loopbound min 0 max 2" ) | ||
216 | while ( ( huff_dec_byte_nb_to_read < 9 ) && ( !huff_dec_end_of_data() ) ) { | ||
217 | huff_dec_val_to_read = ( huff_dec_val_to_read << 8 ) + huff_dec_read_byte(); | ||
218 | huff_dec_byte_nb_to_read += 8; | ||
219 | } | ||
220 | if ( i >= huff_dec_byte_nb_to_read ) { | ||
221 | result = ( result << huff_dec_byte_nb_to_read ) + huff_dec_val_to_read; | ||
222 | i -= huff_dec_byte_nb_to_read; | ||
223 | huff_dec_byte_nb_to_read = 0; | ||
224 | } else { | ||
225 | result = ( result << i ) + ( ( huff_dec_val_to_read >> | ||
226 | ( huff_dec_byte_nb_to_read - i ) ) & ( ( 1 << i ) - 1 ) ); | ||
227 | huff_dec_byte_nb_to_read -= i; | ||
228 | i = 0; | ||
229 | } | ||
230 | } | ||
231 | return ( result ); | ||
232 | } | ||
233 | |||
234 | |||
235 | void huff_dec_read_header( t_bin_val codes_table[257] ) | ||
236 | /* Returned parameters: The contain of 'codes_table' is modified | ||
237 | Action: Rebuilds the binary encoding array by using the header | ||
238 | Errors: An input/output error could disturb the running of the program | ||
239 | */ | ||
240 | { | ||
241 | unsigned int i, j; | ||
242 | unsigned num_byte; | ||
243 | |||
244 | _Pragma ( "loopbound min 257 max 257" ) | ||
245 | for ( i = 0; i < 257; i++ ) { | ||
246 | codes_table[i].bits_nb = 0; | ||
247 | _Pragma ( "loopbound min 32 max 32" ) | ||
248 | for ( j = 0; j < 32; j++ ) { | ||
249 | codes_table[i].bits[j] = 0; | ||
250 | } | ||
251 | } | ||
252 | |||
253 | /* == Decoding of the first part of the header === */ | ||
254 | if ( huff_dec_read_code_1_bit() ) | ||
255 | /* First bit=0 => Present bytes coded on n*8 bits | ||
256 | =1 => Present bytes coded on 256 bits */ | ||
257 | _Pragma ( "loopbound min 256 max 256" ) | ||
258 | for ( i = 0; i <= 255; i++ ) | ||
259 | codes_table[i].presence = huff_dec_read_code_1_bit(); | ||
260 | else { | ||
261 | i = huff_dec_read_code_n_bits( 5 ) + 1; | ||
262 | _Pragma ( "loopbound min 1 max 32" ) | ||
263 | while ( i ) { | ||
264 | codes_table[huff_dec_read_code_n_bits( 8 )].presence = 1; | ||
265 | i--; | ||
266 | } | ||
267 | } | ||
268 | codes_table[256].presence = 1; | ||
269 | /* Presence of a fictive 256-byte is enforced! */ | ||
270 | |||
271 | /* == Decoding the second part of the header == */ | ||
272 | _Pragma ( "loopbound min 257 max 257" ) | ||
273 | for ( i = 0; i <= 256; i++ ) | ||
274 | if ( codes_table[i].presence ) { | ||
275 | if ( huff_dec_read_code_1_bit() ) | ||
276 | /* First bit=0 => 5 bits of binary length-1 followed by a binary word | ||
277 | =1 => 8 bits of binary length-1 followed by a binary word */ | ||
278 | j = huff_dec_read_code_n_bits( 8 ) + 1; | ||
279 | else j = huff_dec_read_code_n_bits( 5 ) + 1; | ||
280 | codes_table[i].bits_nb = j; | ||
281 | /* Reading of a binary word */ | ||
282 | num_byte = ( j - 1 ) >> 3; | ||
283 | if ( j & 7 ) { | ||
284 | /* Reads the bits that takes less than one byte */ | ||
285 | codes_table[i].bits[num_byte] = | ||
286 | ( unsigned char )huff_dec_read_code_n_bits( j & 7 ); | ||
287 | j -= j & 7; | ||
288 | num_byte--; | ||
289 | } | ||
290 | |||
291 | _Pragma ( "loopbound min 0 max 1" ) | ||
292 | while ( j >= 8 ) { | ||
293 | /* Reads the bits that takes one byte, at least */ | ||
294 | codes_table[i].bits[num_byte] = | ||
295 | ( unsigned char )huff_dec_read_code_n_bits( 8 ); | ||
296 | j -= 8; | ||
297 | num_byte--; | ||
298 | } | ||
299 | } | ||
300 | } | ||
301 | |||
302 | |||
303 | huff_dec_t_tree *huff_dec_tree_encoding( t_bin_val codes_table[257], | ||
304 | huff_dec_t_tree heap[514] ) | ||
305 | /* Returned parameters: A binary tree is returned | ||
306 | Action: Returns the decoding binary tree built from 'codes_table' | ||
307 | Errors: None | ||
308 | */ | ||
309 | { | ||
310 | unsigned int i; | ||
311 | unsigned int j; | ||
312 | unsigned int heap_top = 0; | ||
313 | huff_dec_t_tree *ptr_tree; | ||
314 | huff_dec_t_tree *current_node; | ||
315 | |||
316 | ptr_tree = &heap[heap_top++]; | ||
317 | ptr_tree->byte = 257; | ||
318 | ptr_tree->left_ptr = 0; | ||
319 | ptr_tree->right_ptr = 0; | ||
320 | _Pragma ( "loopbound min 257 max 257" ) | ||
321 | for ( i = 0; i <= 256; i++ ) { | ||
322 | _Pragma ( "loopbound min 0 max 9" ) | ||
323 | for ( current_node = ptr_tree, j = codes_table[i].bits_nb; j; j-- ) { | ||
324 | if ( codes_table[i].bits[( j - 1 ) >> 3] & ( 1 << ( ( | ||
325 | j - 1 ) & 7 ) ) ) | ||
326 | if ( current_node->left_ptr == 0 ) { | ||
327 | current_node->left_ptr = &heap[heap_top++]; | ||
328 | current_node = current_node->left_ptr; | ||
329 | current_node->left_ptr = 0; | ||
330 | current_node->right_ptr = 0; | ||
331 | } else current_node = current_node->left_ptr; | ||
332 | else | ||
333 | if ( current_node->right_ptr == 0 ) { | ||
334 | current_node->right_ptr = &heap[heap_top++]; | ||
335 | current_node = current_node->right_ptr; | ||
336 | current_node->left_ptr = 0; | ||
337 | current_node->right_ptr = 0; | ||
338 | } else current_node = current_node->right_ptr; | ||
339 | if ( j == 1 ) | ||
340 | current_node->byte = i; | ||
341 | else current_node->byte = 257; | ||
342 | } | ||
343 | }; | ||
344 | return ( ptr_tree ); | ||
345 | } | ||
346 | |||
347 | |||
348 | void _Pragma( "entrypoint" ) huff_dec_main( void ) | ||
349 | /* Returned parameters: None | ||
350 | Action: Decompresses with Huffman method all bytes read by the function | ||
351 | 'read_code_1_bit' and 'read_code_n_bits' | ||
352 | Errors: An input/output error could disturb the running of the program | ||
353 | */ | ||
354 | { | ||
355 | t_bin_val encoding_table[257]; | ||
356 | huff_dec_t_tree heap[514]; /* space for dynamically allocated nodes */ | ||
357 | huff_dec_t_tree *ptr_tree; | ||
358 | huff_dec_t_tree *current_node; | ||
359 | |||
360 | if ( !huff_dec_end_of_data() ) { /* Are there data to compress? */ | ||
361 | huff_dec_read_header( encoding_table ); | ||
362 | ptr_tree = huff_dec_tree_encoding( encoding_table, heap ); | ||
363 | _Pragma ( "loopbound min 602 max 602" ) | ||
364 | do { | ||
365 | current_node = ptr_tree; | ||
366 | _Pragma ( "loopbound min 3 max 9" ) | ||
367 | while ( current_node->byte == 257 ) | ||
368 | if ( huff_dec_read_code_1_bit() ) | ||
369 | /* Bit=1 => Got to left in the node of the tree | ||
370 | =0 => Got to right in the node of the tree */ | ||
371 | current_node = current_node->left_ptr; | ||
372 | else current_node = current_node->right_ptr; | ||
373 | if ( current_node->byte <= 255 ) | ||
374 | huff_dec_write_byte( current_node->byte ); | ||
375 | } while ( current_node->byte != 256 ); | ||
376 | } | ||
377 | } | ||
378 | |||
379 | |||
380 | int main( int argc, char **argv ) | ||
381 | { | ||
382 | //SET_UP | ||
383 | int jobsComplete; | ||
384 | int maxJobs=5; | ||
385 | for (jobsComplete=-1; jobsComplete<maxJobs; jobsComplete++){ | ||
386 | // START_LOOP | ||
387 | huff_dec_init(); | ||
388 | huff_dec_main(); | ||
389 | // STOP_LOOP | ||
390 | } | ||
391 | //WRITE_TO_FILE | ||
392 | return ( huff_dec_return() ); | ||
393 | } | ||