lib/util/runlength.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403

/*=============================================================================
                                  runlength.c
===============================================================================
  "Packbits" run-length encoding and variants.

  Copyright (C) 2015 by Akira Urushibata (afu@wta.att.ne.jp).

  Permission to use, copy, modify, and distribute this software and its
  documentation for any purpose and without fee is hereby granted, provided
  that the above copyright notice appear in all copies and that both that
  copyright notice and this permission notice appear in supporting
  documentation.  This software is provided "as is" without express or implied
  warranty.

  Functions pm_rlenc_compressbyte() and pm_rlenc_compressword() are based
  on algorithm originally by Robert A. Knop (rknop@mop.caltech.edu).

  Those who wish to incorporate the code herein into their own programs are
  strongly discouraged from removing the comments within borders consisting
  of "+" marks.  This is a practical consideration based on code inspection
  and bug fixes of various programs which use run-length compression.

===============================================================================

  Packbits is a simple yet efficient simple run-length compression method.
  It has a provision for uncompressed sequences which allows efficient
  encoding of noisy input.

  A survey of netpbm source code in 2015 found Packbits encoding in the
  following Netpbm programs: pbmtoescp2, pbmtomacp, pnmtopalm, pnmtopclxl,
  pnmtops, ppmtoilbm and ppmtopjxl.

  Packbits is an option in the TIFF standard; pamtotiff can generate TIFF
  images that use Packbits compression, but does so via code in the TIFF
  library (not part of Netpbm).

  Variants of Packbits are employed in pnmtosgi, pamtopdbimg, pbmtoppa
  pbmtogo and pamtotga.

  All the above programs formerly did the same compression through different
  code.  This redundancy bloated the Netpbm package and made maintenance
  difficult.  This maintenance difficulty surfaced as an issue when tests with
  valgrind revealed multiple memory-related problems in the above programs.
  Indeed, just determining which programs do compression by this method
  was not simple because of differences in terminology.  "Packbits" is often
  called "run length encoding."  In ppmtoilbm the name is "byterun1."

  Today, all Netpbm programs that do Packbits compression with the exception
  of pamtotiff, pbmtoppa, pbmtogo and pamtotga use the facilities in this
  file for it.
=============================================================================*/

#include <string.h>

#include "pm.h"
#include "pm_c_util.h"
#include "runlength.h"
#include "mallocvar.h"


static const char * const errorUndefinedMode =
    "Internal error: compression mode %u not supported";


/*-----------------------------------------------------------------------------
   Run length encoding

   In this simple run-length encoding scheme, compressed and uncompressed
   strings follow a single index or "flag" byte N.  There are several
   variations, differing in the format of the flag byte, maximum string
   length and element size (byte or word).

   In the most widely used version, Packbits, the meaning of the flag byte
   N is defined as follows:

    0-127 means the next N+1 bytes are uncompressed.
    129-255 means the next byte is to be repeated 257-N times.
    128 is not used.

   The name "Packbits" is misleading: it packs bytes, not bits.
-----------------------------------------------------------------------------*/


void
pm_rlenc_compressbyte(const unsigned char * const inbuf,
                      unsigned char       * const outbuf,
                      enum pm_RleMode       const mode,
                      size_t                const inSize,
                      size_t              * const outputSizeP) {
/*-----------------------------------------------------------------------------
  Compress the contents of input buffer 'inbuf' with Packbits encoding into
  output buffer 'outbuf'.  'inSize' is the number of bytes of data in 'inbuf'.
  Return as *outputSizeP the number of bytes we put in 'outbuf'.

  'outbuf' should be allocated with pm_rlenc_allocoutbuf().

  Always encode 3-byte repeat sequences.

  Encode 2-byte repeat sequences only when they are at the start of the block.
  This ensures that the output is never unnecessarily bloated.

  Original algorithm by Robert A. Knop (rknop@mop.caltech.edu)
-----------------------------------------------------------------------------*/
    unsigned int const maxRun = 128;

    size_t inCurs, outCurs;

    int packBase;
    int packSign;

    switch (mode) {
    case PM_RLE_PACKBITS:
        packBase = 257 ; packSign = -1; break;
    case PM_RLE_PALMPDB:
        packBase = 127 ; packSign = +1; break;
    default:
         pm_error(errorUndefinedMode, mode);
    }

    for (inCurs = 0, outCurs = 0; inCurs < inSize; ) {
        if ((inCurs < inSize - 1) && (inbuf[inCurs] == inbuf[inCurs+1])) {
            /*Begin replicate run*/
            size_t const hold = inCurs;
            size_t count;
            for (count = 0;
                 inCurs < inSize &&
                     inbuf[inCurs] == inbuf[hold] &&
                     count < maxRun;
                 ++inCurs, ++count)
                ;
            outbuf[outCurs++] = (unsigned char) (packBase + packSign * count);
            outbuf[outCurs++] = inbuf[hold];
        } else {
            /*Do a non-run*/
            size_t const hold = outCurs;
            size_t count;
            ++outCurs;
            count = 0;
            while(((inCurs + 2 >= inSize) && (inCurs < inSize) ) ||
                  ((inCurs + 2 <  inSize) &&
                   ((inbuf[inCurs] != inbuf[inCurs+1]  )
                    || (inbuf[inCurs] != inbuf[inCurs+2]  ) ) )    ) {
                outbuf[outCurs++] = inbuf[inCurs++];
                if (++count >= 128)
                    break;
            }
            outbuf[hold] = (unsigned char)(count - 1);
        }
    }
    *outputSizeP = outCurs;
}


static void
setFlagElement(void            * const outP,
               enum pm_RleMode   const mode,
               bool              const isRepeatRun,
               size_t            const count) {
/*---------------------------------------------------------------------------
  Write the flag byte or word at specified point in the output buffer.
-----------------------------------------------------------------------------*/
    switch (mode) {
    case PM_RLE_SGI16:
        * (uint16_t *) outP =  (isRepeatRun ? 0x00 : 0x80 ) | count;
        break;
    case PM_RLE_PALM16:
        * (unsigned char *) outP = isRepeatRun ?
            (unsigned char)(257 - count) : (unsigned char) (count - 1);
        break;
    default:
        pm_error(errorUndefinedMode, mode);
    }
}


void
pm_rlenc_compressword(const uint16_t   * const inbuf,
                      unsigned char *    const outbuf,
                      enum pm_RleMode    const mode,
                      size_t             const inSize,
                      size_t           * const outputSizeP) {
/*---------------------------------------------------------------------------
   Similar to pm_rlenc_compressbyte(), but this works with two-byte elements.
   The difference between SGI16 and PALM16 is the size of the flag.
   SGI16 : 16 bits; PALM16 : 8 bits

   'inSize' is a number of words, but *outputSizeP is a number of bytes.
-----------------------------------------------------------------------------*/
    size_t inCurs, outCurs;
    size_t flagSz;
    unsigned int maxRunSz;

    switch (mode) {
    case PM_RLE_SGI16:
        flagSz = 2;
        maxRunSz = 127;
        break;
    case PM_RLE_PALM16:
        flagSz = 1;
        maxRunSz = 128;
        break;
    default:
        pm_error(errorUndefinedMode, mode);
    }

    for (inCurs = 0, outCurs = 0; inCurs < inSize; ) {
        if ((inCurs < inSize - 1) && (inbuf[inCurs] == inbuf[inCurs+1])) {
            uint16_t const runValue = inbuf[inCurs];
            size_t count;
            /* Do a run of 'runValue' values */
            for (count = 0;
                 count < maxRunSz &&
                     inCurs < inSize &&
                     inbuf[inCurs] == runValue;
                 ++inCurs, ++count)
                ;
            setFlagElement(&outbuf[outCurs], mode, TRUE, count);
            outCurs += flagSz;
            * (uint16_t *) &outbuf[outCurs] = runValue;
            outCurs += 2;
        } else {
            /*Do a non run*/
            size_t const nonrunStart = inCurs;
            size_t count;
            count = 0;
            while (count < maxRunSz &&
                   ((inCurs + 2 >= inSize && inCurs < inSize) ||
                    (inCurs + 2 <  inSize &&
                     (inbuf[inCurs] != inbuf[inCurs+1]
                      || inbuf[inCurs] != inbuf[inCurs+2])))) {
                ++inCurs;
                ++count;
            }
            setFlagElement(&outbuf[outCurs], mode, FALSE, count);
            outCurs += flagSz;
            memcpy(&outbuf[outCurs], &inbuf[nonrunStart], count * 2);
            outCurs += count * 2;
        }
    }

    if (mode == PM_RLE_SGI16) {
        * (uint16_t *) &outbuf[outCurs] = 0;     /* terminator */
        outCurs += 2;
    }

    *outputSizeP = outCurs;
}


/*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
   Worst case output size

   The term "worst-case output size" can mean one of two things depending on
   whether the encoder is efficient or not.

   Sub-optimal encoder: the output size can be up to twice the input size.
   This happens (or one may rather say "is achieved by a determined encoder")
   when input is split into one-byte blocks.

   Efficient encoder: the output is larger than the input by the minimum
   number of flag bytes.  The worst case happens when there are no repeat
   sequences in the input.

   The encoder in this file is an efficient one.

   The key to an efficient encoder is correct handling of short, inefficient
   sequences.  The following algorithm (not actually used in this file)
   describes the idea.

   A run is a maximal set of two or more consecutive identical values in the
   input.  A nonrun is a maximal set of values in which every value is
   different from its neighbors.

   A compressed block is one that encodes a sequence of identical values
   (which could be a run or just part of a very long run) as a value and
   a count.  An uncompressed block is one that encodes a sequence of any
   values by listing the values individually.

   Start by encoding the entire input as uncompressed blocks.  Seek runs that
   can be encoded with compressed blocks, but only if doing so doesn't make
   the output larger.

   Criteria to avoid bloat:

     - Overhead for a single uncompressed block: 1 byte.

     - Overhead for one uncompressed block and a compressed block: 2 bytes.

     - Overhead for two uncompressed blocks and a compressed block: 3 bytes.

     - New blocks at the edge of any existing block add 1 byte of overhead.
       New blocks in the middle of existing blocks add 2 bytes of overhead.

     - Whenever savings are larger than the added overhead, encode the run
       as a compressed block.

   -----------------------------------------------------------------------

   MS-DOS/Windows BMP format note

   The compression method for BMP is a variant of packbits.
   We could support the 8-bit version with some modifications to functions
   in this file.  Determining the worst-case output size of an efficient
   coder is rather complicated because uncompressed blocks may not be less
   than three bytes long and are indicated by two-byte flags.
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/


size_t
pm_rlenc_maxbytes(size_t          const inSize,  /* number of elements */
                  enum pm_RleMode const mode) {
/*---------------------------------------------------------------------------
   Calculate worst case output size from input size and encoding mode.

   'inSize' counts the number of elements, not bytes: input size is (2 *
   inSize) bytes if input is an array of 16-bit words.

   Return value is the maximum possible output size in bytes regardless of
   type of input.

   Abort the program if the maximum possible output size is greater than
   INT_MAX.
-----------------------------------------------------------------------------*/
   /* The upper limit could be raised above INT_MAX, but no program needs that
      today.
   */
    size_t blockSize;  /* Number of items in data block */
    size_t flagSize;   /* Size of flag, in bytes */
    size_t itemSize;   /* Size of item, in bytes */
    size_t miscSize;   /* Size of other elements such as term code, in bytes */
    size_t overhead;   /* Worst-case overhead, in bytes */
    /* return value:      Worst-case output size, in bytes */

    switch (mode) {
    case PM_RLE_PACKBITS:
    case PM_RLE_PALMPDB:
        blockSize = 128; flagSize = 1; itemSize = 1; miscSize = 0;
        break;
    case PM_RLE_SGI8:
        blockSize = 127; flagSize = 1; itemSize = 1; miscSize = 0;
        break;
    case PM_RLE_GRAPHON:
    case PM_RLE_PPA:
        blockSize =  64; flagSize = 1; itemSize = 1; miscSize = 0;
        break;
    case PM_RLE_SGI16:
        blockSize = 127; flagSize = 2; itemSize = 2; miscSize = 2;
        break;
    case PM_RLE_PALM16:
        blockSize = 128; flagSize = 1; itemSize = 2; miscSize = 0;
        break;
    default:
        pm_error(errorUndefinedMode, mode);
    }

    overhead = miscSize +
        (inSize / blockSize + (inSize % blockSize > 0 ? 1 : 0) ) * flagSize;

    if (inSize > INT_MAX / itemSize ||
        inSize * itemSize > INT_MAX - overhead)
        pm_error("Cannot do RLE compression.  Input too large.");

    return inSize * itemSize + overhead;
}


void
pm_rlenc_allocoutbuf(unsigned char ** const outbufP,
                     size_t           const inSize,  /* element count */
                     enum pm_RleMode  const mode) {
/*---------------------------------------------------------------------------
   Allocate an output buffer sufficient for input with inSize elements, using
   compression mode 'mode'.  Element may be byte or word, whichever 'mode'
   implies.
-----------------------------------------------------------------------------*/
    size_t const size = pm_rlenc_maxbytes(inSize, mode);

    unsigned char * outbuf;

    MALLOCARRAY(outbuf, size);
    if (outbuf == NULL)
        pm_error("Out of memory trying to get %u bytes for RLE output buffer",
                 (unsigned)size);

    *outbufP = outbuf;
}


void
pm_rlenc_freebuf(void * const buf) {
    free(buf);
}