1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
|
/*=============================================================================
runlength.c
===============================================================================
"Packbits" run-length encoding and variants.
Copyright (C) 2015 by Akira Urushibata (afu@wta.att.ne.jp).
Permission to use, copy, modify, and distribute this software and its
documentation for any purpose and without fee is hereby granted, provided
that the above copyright notice appear in all copies and that both that
copyright notice and this permission notice appear in supporting
documentation. This software is provided "as is" without express or implied
warranty.
Functions pm_rlenc_compressbyte() and pm_rlenc_compressword() are based
on algorithm originally by Robert A. Knop (rknop@mop.caltech.edu).
Those who wish to incorporate the code herein into their own programs are
strongly discouraged from removing the comments within borders consisting
of "+" marks. This is a practical consideration based on code inspection
and bug fixes of various programs which use run-length compression.
===============================================================================
Packbits is a simple yet efficient simple run-length compression method.
It has a provision for uncompressed sequences which allows efficient
encoding of noisy input.
A survey of netpbm source code in 2015 found Packbits encoding in the
following Netpbm programs: pbmtoescp2, pbmtomacp, pnmtopalm, pnmtopclxl,
pnmtops, ppmtoilbm and ppmtopjxl.
Packbits is an option in the TIFF standard; pamtotiff can generate TIFF
images that use Packbits compression, but does so via code in the TIFF
library (not part of Netpbm).
Variants of Packbits are employed in pnmtosgi, pamtopdbimg, pbmtoppa
pbmtogo and pamtotga.
All the above programs formerly did the same compression through different
code. This redundancy bloated the Netpbm package and made maintenance
difficult. This maintenance difficulty surfaced as an issue when tests with
valgrind revealed multiple memory-related problems in the above programs.
Indeed, just determining which programs do compression by this method
was not simple because of differences in terminology. "Packbits" is often
called "run length encoding." In ppmtoilbm the name is "byterun1."
Today, all Netpbm programs that do Packbits compression with the exception
of pamtotiff, pbmtoppa, pbmtogo and pamtotga use the facilities in this
file for it.
=============================================================================*/
#include <string.h>
#include "pm.h"
#include "pm_c_util.h"
#include "runlength.h"
#include "mallocvar.h"
static const char * const errorUndefinedMode =
"Internal error: compression mode %u not supported";
/*-----------------------------------------------------------------------------
Run length encoding
In this simple run-length encoding scheme, compressed and uncompressed
strings follow a single index or "flag" byte N. There are several
variations, differing in the format of the flag byte, maximum string
length and element size (byte or word).
In the most widely used version, Packbits, the meaning of the flag byte
N is defined as follows:
0-127 means the next N+1 bytes are uncompressed.
129-255 means the next byte is to be repeated 257-N times.
128 is not used.
The name "Packbits" is misleading: it packs bytes, not bits.
-----------------------------------------------------------------------------*/
void
pm_rlenc_compressbyte(const unsigned char * const inbuf,
unsigned char * const outbuf,
enum pm_RleMode const mode,
size_t const inSize,
size_t * const outputSizeP) {
/*-----------------------------------------------------------------------------
Compress the contents of input buffer 'inbuf' with Packbits encoding into
output buffer 'outbuf'. 'inSize' is the number of bytes of data in 'inbuf'.
Return as *outputSizeP the number of bytes we put in 'outbuf'.
'outbuf' should be allocated with pm_rlenc_allocoutbuf().
Always encode 3-byte repeat sequences.
Encode 2-byte repeat sequences only when they are at the start of the block.
This ensures that the output is never unnecessarily bloated.
Original algorithm by Robert A. Knop (rknop@mop.caltech.edu)
-----------------------------------------------------------------------------*/
unsigned int const maxRun = 128;
size_t inCurs, outCurs;
int packBase;
int packSign;
switch (mode) {
case PM_RLE_PACKBITS:
packBase = 257 ; packSign = -1; break;
case PM_RLE_PALMPDB:
packBase = 127 ; packSign = +1; break;
default:
pm_error(errorUndefinedMode, mode);
}
for (inCurs = 0, outCurs = 0; inCurs < inSize; ) {
if ((inCurs < inSize - 1) && (inbuf[inCurs] == inbuf[inCurs+1])) {
/*Begin replicate run*/
size_t const hold = inCurs;
size_t count;
for (count = 0;
inCurs < inSize &&
inbuf[inCurs] == inbuf[hold] &&
count < maxRun;
++inCurs, ++count)
;
outbuf[outCurs++] = (unsigned char) (packBase + packSign * count);
outbuf[outCurs++] = inbuf[hold];
} else {
/*Do a non-run*/
size_t const hold = outCurs;
size_t count;
++outCurs;
count = 0;
while(((inCurs + 2 >= inSize) && (inCurs < inSize) ) ||
((inCurs + 2 < inSize) &&
((inbuf[inCurs] != inbuf[inCurs+1] )
|| (inbuf[inCurs] != inbuf[inCurs+2] ) ) ) ) {
outbuf[outCurs++] = inbuf[inCurs++];
if (++count >= 128)
break;
}
outbuf[hold] = (unsigned char)(count - 1);
}
}
*outputSizeP = outCurs;
}
static void
setFlagElement(void * const outP,
enum pm_RleMode const mode,
bool const isRepeatRun,
size_t const count) {
/*---------------------------------------------------------------------------
Write the flag byte or word at specified point in the output buffer.
-----------------------------------------------------------------------------*/
switch (mode) {
case PM_RLE_SGI16:
* (uint16_t *) outP = (isRepeatRun ? 0x00 : 0x80 ) | count;
break;
case PM_RLE_PALM16:
* (unsigned char *) outP = isRepeatRun ?
(unsigned char)(257 - count) : (unsigned char) (count - 1);
break;
default:
pm_error(errorUndefinedMode, mode);
}
}
void
pm_rlenc_compressword(const uint16_t * const inbuf,
unsigned char * const outbuf,
enum pm_RleMode const mode,
size_t const inSize,
size_t * const outputSizeP) {
/*---------------------------------------------------------------------------
Similar to pm_rlenc_compressbyte(), but this works with two-byte elements.
The difference between SGI16 and PALM16 is the size of the flag.
SGI16 : 16 bits; PALM16 : 8 bits
'inSize' is a number of words, but *outputSizeP is a number of bytes.
-----------------------------------------------------------------------------*/
size_t inCurs, outCurs;
size_t flagSz;
unsigned int maxRunSz;
switch (mode) {
case PM_RLE_SGI16:
flagSz = 2;
maxRunSz = 127;
break;
case PM_RLE_PALM16:
flagSz = 1;
maxRunSz = 128;
break;
default:
pm_error(errorUndefinedMode, mode);
}
for (inCurs = 0, outCurs = 0; inCurs < inSize; ) {
if ((inCurs < inSize - 1) && (inbuf[inCurs] == inbuf[inCurs+1])) {
uint16_t const runValue = inbuf[inCurs];
size_t count;
/* Do a run of 'runValue' values */
for (count = 0;
count < maxRunSz &&
inCurs < inSize &&
inbuf[inCurs] == runValue;
++inCurs, ++count)
;
setFlagElement(&outbuf[outCurs], mode, TRUE, count);
outCurs += flagSz;
* (uint16_t *) &outbuf[outCurs] = runValue;
outCurs += 2;
} else {
/*Do a non run*/
size_t const nonrunStart = inCurs;
size_t count;
count = 0;
while (count < maxRunSz &&
((inCurs + 2 >= inSize && inCurs < inSize) ||
(inCurs + 2 < inSize &&
(inbuf[inCurs] != inbuf[inCurs+1]
|| inbuf[inCurs] != inbuf[inCurs+2])))) {
++inCurs;
++count;
}
setFlagElement(&outbuf[outCurs], mode, FALSE, count);
outCurs += flagSz;
memcpy(&outbuf[outCurs], &inbuf[nonrunStart], count * 2);
outCurs += count * 2;
}
}
if (mode == PM_RLE_SGI16) {
* (uint16_t *) &outbuf[outCurs] = 0; /* terminator */
outCurs += 2;
}
*outputSizeP = outCurs;
}
/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Worst case output size
The term "worst-case output size" can mean one of two things depending on
whether the encoder is efficient or not.
Sub-optimal encoder: the output size can be up to twice the input size.
This happens (or one may rather say "is achieved by a determined encoder")
when input is split into one-byte blocks.
Efficient encoder: the output is larger than the input by the minimum
number of flag bytes. The worst case happens when there are no repeat
sequences in the input.
The encoder in this file is an efficient one.
The key to an efficient encoder is correct handling of short, inefficient
sequences. The following algorithm (not actually used in this file)
describes the idea.
A run is a maximal set of two or more consecutive identical values in the
input. A nonrun is a maximal set of values in which every value is
different from its neighbors.
A compressed block is one that encodes a sequence of identical values
(which could be a run or just part of a very long run) as a value and
a count. An uncompressed block is one that encodes a sequence of any
values by listing the values individually.
Start by encoding the entire input as uncompressed blocks. Seek runs that
can be encoded with compressed blocks, but only if doing so doesn't make
the output larger.
Criteria to avoid bloat:
- Overhead for a single uncompressed block: 1 byte.
- Overhead for one uncompressed block and a compressed block: 2 bytes.
- Overhead for two uncompressed blocks and a compressed block: 3 bytes.
- New blocks at the edge of any existing block add 1 byte of overhead.
New blocks in the middle of existing blocks add 2 bytes of overhead.
- Whenever savings are larger than the added overhead, encode the run
as a compressed block.
-----------------------------------------------------------------------
MS-DOS/Windows BMP format note
The compression method for BMP is a variant of packbits.
We could support the 8-bit version with some modifications to functions
in this file. Determining the worst-case output size of an efficient
coder is rather complicated because uncompressed blocks may not be less
than three bytes long and are indicated by two-byte flags.
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/
size_t
pm_rlenc_maxbytes(size_t const inSize, /* number of elements */
enum pm_RleMode const mode) {
/*---------------------------------------------------------------------------
Calculate worst case output size from input size and encoding mode.
'inSize' counts the number of elements, not bytes: input size is (2 *
inSize) bytes if input is an array of 16-bit words.
Return value is the maximum possible output size in bytes regardless of
type of input.
Abort the program if the maximum possible output size is greater than
INT_MAX.
-----------------------------------------------------------------------------*/
/* The upper limit could be raised above INT_MAX, but no program needs that
today.
*/
size_t blockSize; /* Number of items in data block */
size_t flagSize; /* Size of flag, in bytes */
size_t itemSize; /* Size of item, in bytes */
size_t miscSize; /* Size of other elements such as term code, in bytes */
size_t overhead; /* Worst-case overhead, in bytes */
/* return value: Worst-case output size, in bytes */
switch (mode) {
case PM_RLE_PACKBITS:
case PM_RLE_PALMPDB:
blockSize = 128; flagSize = 1; itemSize = 1; miscSize = 0;
break;
case PM_RLE_SGI8:
blockSize = 127; flagSize = 1; itemSize = 1; miscSize = 0;
break;
case PM_RLE_GRAPHON:
case PM_RLE_PPA:
blockSize = 64; flagSize = 1; itemSize = 1; miscSize = 0;
break;
case PM_RLE_SGI16:
blockSize = 127; flagSize = 2; itemSize = 2; miscSize = 2;
break;
case PM_RLE_PALM16:
blockSize = 128; flagSize = 1; itemSize = 2; miscSize = 0;
break;
default:
pm_error(errorUndefinedMode, mode);
}
overhead = miscSize +
(inSize / blockSize + (inSize % blockSize > 0 ? 1 : 0) ) * flagSize;
if (inSize > INT_MAX / itemSize ||
inSize * itemSize > INT_MAX - overhead)
pm_error("Cannot do RLE compression. Input too large.");
return inSize * itemSize + overhead;
}
void
pm_rlenc_allocoutbuf(unsigned char ** const outbufP,
size_t const inSize, /* element count */
enum pm_RleMode const mode) {
/*---------------------------------------------------------------------------
Allocate an output buffer sufficient for input with inSize elements, using
compression mode 'mode'. Element may be byte or word, whichever 'mode'
implies.
-----------------------------------------------------------------------------*/
size_t const size = pm_rlenc_maxbytes(inSize, mode);
unsigned char * outbuf;
MALLOCARRAY(outbuf, size);
if (outbuf == NULL)
pm_error("Out of memory trying to get %u bytes for RLE output buffer",
(unsigned)size);
*outbufP = outbuf;
}
void
pm_rlenc_freebuf(void * const buf) {
free(buf);
}
|