/*============================================================================= runlength.c =============================================================================== "Packbits" run-length encoding and variants. Copyright (C) 2015 by Akira Urushibata (afu@wta.att.ne.jp). Permission to use, copy, modify, and distribute this software and its documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appear in all copies and that both that copyright notice and this permission notice appear in supporting documentation. This software is provided "as is" without express or implied warranty. Functions pm_rlenc_compressbyte() and pm_rlenc_compressword() are based on algorithm originally by Robert A. Knop (rknop@mop.caltech.edu). Those who wish to incorporate the code herein into their own programs are strongly discouraged from removing the comments within borders consisting of "+" marks. This is a practical consideration based on code inspection and bug fixes of various programs which use run-length compression. =============================================================================== Packbits is a simple yet efficient simple run-length compression method. It has a provision for uncompressed sequences which allows efficient encoding of noisy input. A survey of netpbm source code in 2015 found Packbits encoding in the following Netpbm programs: pbmtoescp2, pbmtomacp, pnmtopalm, pnmtopclxl, pnmtops, ppmtoilbm and ppmtopjxl. Packbits is an option in the TIFF standard; pamtotiff can generate TIFF images that use Packbits compression, but does so via code in the TIFF library (not part of Netpbm). Variants of Packbits are employed in pnmtosgi, pamtopdbimg, pbmtoppa pbmtogo and pamtotga. All the above programs formerly did the same compression through different code. This redundancy bloated the Netpbm package and made maintenance difficult. This maintenance difficulty surfaced as an issue when tests with valgrind revealed multiple memory-related problems in the above programs. Indeed, just determining which programs do compression by this method was not simple because of differences in terminology. "Packbits" is often called "run length encoding." In ppmtoilbm the name is "byterun1." Today, all Netpbm programs that do Packbits compression with the exception of pamtotiff, pbmtoppa, pbmtogo and pamtotga use the facilities in this file for it. =============================================================================*/ #include #include "pm.h" #include "pm_c_util.h" #include "runlength.h" #include "mallocvar.h" static const char * const errorUndefinedMode = "Internal error: compression mode %u not supported"; /*----------------------------------------------------------------------------- Run length encoding In this simple run-length encoding scheme, compressed and uncompressed strings follow a single index or "flag" byte N. There are several variations, differing in the format of the flag byte, maximum string length and element size (byte or word). In the most widely used version, Packbits, the meaning of the flag byte N is defined as follows: 0-127 means the next N+1 bytes are uncompressed. 129-255 means the next byte is to be repeated 257-N times. 128 is not used. The name "Packbits" is misleading: it packs bytes, not bits. -----------------------------------------------------------------------------*/ void pm_rlenc_compressbyte(const unsigned char * const inbuf, unsigned char * const outbuf, enum pm_RleMode const mode, size_t const inSize, size_t * const outputSizeP) { /*----------------------------------------------------------------------------- Compress the contents of input buffer 'inbuf' with Packbits encoding into output buffer 'outbuf'. 'inSize' is the number of bytes of data in 'inbuf'. Return as *outputSizeP the number of bytes we put in 'outbuf'. 'outbuf' should be allocated with pm_rlenc_allocoutbuf(). Always encode 3-byte repeat sequences. Encode 2-byte repeat sequences only when they are at the start of the block. This ensures that the output is never unnecessarily bloated. Original algorithm by Robert A. Knop (rknop@mop.caltech.edu) -----------------------------------------------------------------------------*/ unsigned int const maxRun = 128; size_t inCurs, outCurs; int packBase; int packSign; switch (mode) { case PM_RLE_PACKBITS: packBase = 257 ; packSign = -1; break; case PM_RLE_PALMPDB: packBase = 127 ; packSign = +1; break; default: pm_error(errorUndefinedMode, mode); } for (inCurs = 0, outCurs = 0; inCurs < inSize; ) { if ((inCurs < inSize - 1) && (inbuf[inCurs] == inbuf[inCurs+1])) { /*Begin replicate run*/ size_t const hold = inCurs; size_t count; for (count = 0; inCurs < inSize && inbuf[inCurs] == inbuf[hold] && count < maxRun; ++inCurs, ++count) ; outbuf[outCurs++] = (unsigned char) (packBase + packSign * count); outbuf[outCurs++] = inbuf[hold]; } else { /*Do a non-run*/ size_t const hold = outCurs; size_t count; ++outCurs; count = 0; while(((inCurs + 2 >= inSize) && (inCurs < inSize) ) || ((inCurs + 2 < inSize) && ((inbuf[inCurs] != inbuf[inCurs+1] ) || (inbuf[inCurs] != inbuf[inCurs+2] ) ) ) ) { outbuf[outCurs++] = inbuf[inCurs++]; if (++count >= 128) break; } outbuf[hold] = (unsigned char)(count - 1); } } *outputSizeP = outCurs; } static void setFlagElement(void * const outP, enum pm_RleMode const mode, bool const isRepeatRun, size_t const count) { /*--------------------------------------------------------------------------- Write the flag byte or word at specified point in the output buffer. -----------------------------------------------------------------------------*/ switch (mode) { case PM_RLE_SGI16: * (uint16_t *) outP = (isRepeatRun ? 0x00 : 0x80 ) | count; break; case PM_RLE_PALM16: * (unsigned char *) outP = isRepeatRun ? (unsigned char)(257 - count) : (unsigned char) (count - 1); break; default: pm_error(errorUndefinedMode, mode); } } void pm_rlenc_compressword(const uint16_t * const inbuf, unsigned char * const outbuf, enum pm_RleMode const mode, size_t const inSize, size_t * const outputSizeP) { /*--------------------------------------------------------------------------- Similar to pm_rlenc_compressbyte(), but this works with two-byte elements. The difference between SGI16 and PALM16 is the size of the flag. SGI16 : 16 bits; PALM16 : 8 bits 'inSize' is a number of words, but *outputSizeP is a number of bytes. -----------------------------------------------------------------------------*/ size_t inCurs, outCurs; size_t flagSz; unsigned int maxRunSz; switch (mode) { case PM_RLE_SGI16: flagSz = 2; maxRunSz = 127; break; case PM_RLE_PALM16: flagSz = 1; maxRunSz = 128; break; default: pm_error(errorUndefinedMode, mode); } for (inCurs = 0, outCurs = 0; inCurs < inSize; ) { if ((inCurs < inSize - 1) && (inbuf[inCurs] == inbuf[inCurs+1])) { uint16_t const runValue = inbuf[inCurs]; size_t count; /* Do a run of 'runValue' values */ for (count = 0; count < maxRunSz && inCurs < inSize && inbuf[inCurs] == runValue; ++inCurs, ++count) ; setFlagElement(&outbuf[outCurs], mode, TRUE, count); outCurs += flagSz; * (uint16_t *) &outbuf[outCurs] = runValue; outCurs += 2; } else { /*Do a non run*/ size_t const nonrunStart = inCurs; size_t count; count = 0; while (count < maxRunSz && ((inCurs + 2 >= inSize && inCurs < inSize) || (inCurs + 2 < inSize && (inbuf[inCurs] != inbuf[inCurs+1] || inbuf[inCurs] != inbuf[inCurs+2])))) { ++inCurs; ++count; } setFlagElement(&outbuf[outCurs], mode, FALSE, count); outCurs += flagSz; memcpy(&outbuf[outCurs], &inbuf[nonrunStart], count * 2); outCurs += count * 2; } } if (mode == PM_RLE_SGI16) { * (uint16_t *) &outbuf[outCurs] = 0; /* terminator */ outCurs += 2; } *outputSizeP = outCurs; } /*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Worst case output size The term "worst-case output size" can mean one of two things depending on whether the encoder is efficient or not. Sub-optimal encoder: the output size can be up to twice the input size. This happens (or one may rather say "is achieved by a determined encoder") when input is split into one-byte blocks. Efficient encoder: the output is larger than the input by the minimum number of flag bytes. The worst case happens when there are no repeat sequences in the input. The encoder in this file is an efficient one. The key to an efficient encoder is correct handling of short, inefficient sequences. The following algorithm (not actually used in this file) describes the idea. A run is a maximal set of two or more consecutive identical values in the input. A nonrun is a maximal set of values in which every value is different from its neighbors. A compressed block is one that encodes a sequence of identical values (which could be a run or just part of a very long run) as a value and a count. An uncompressed block is one that encodes a sequence of any values by listing the values individually. Start by encoding the entire input as uncompressed blocks. Seek runs that can be encoded with compressed blocks, but only if doing so doesn't make the output larger. Criteria to avoid bloat: - Overhead for a single uncompressed block: 1 byte. - Overhead for one uncompressed block and a compressed block: 2 bytes. - Overhead for two uncompressed blocks and a compressed block: 3 bytes. - New blocks at the edge of any existing block add 1 byte of overhead. New blocks in the middle of existing blocks add 2 bytes of overhead. - Whenever savings are larger than the added overhead, encode the run as a compressed block. ----------------------------------------------------------------------- MS-DOS/Windows BMP format note The compression method for BMP is a variant of packbits. We could support the 8-bit version with some modifications to functions in this file. Determining the worst-case output size of an efficient coder is rather complicated because uncompressed blocks may not be less than three bytes long and are indicated by two-byte flags. +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++*/ size_t pm_rlenc_maxbytes(size_t const inSize, /* number of elements */ enum pm_RleMode const mode) { /*--------------------------------------------------------------------------- Calculate worst case output size from input size and encoding mode. 'inSize' counts the number of elements, not bytes: input size is (2 * inSize) bytes if input is an array of 16-bit words. Return value is the maximum possible output size in bytes regardless of type of input. Abort the program if the maximum possible output size is greater than INT_MAX. -----------------------------------------------------------------------------*/ /* The upper limit could be raised above INT_MAX, but no program needs that today. */ size_t blockSize; /* Number of items in data block */ size_t flagSize; /* Size of flag, in bytes */ size_t itemSize; /* Size of item, in bytes */ size_t miscSize; /* Size of other elements such as term code, in bytes */ size_t overhead; /* Worst-case overhead, in bytes */ /* return value: Worst-case output size, in bytes */ switch (mode) { case PM_RLE_PACKBITS: case PM_RLE_PALMPDB: blockSize = 128; flagSize = 1; itemSize = 1; miscSize = 0; break; case PM_RLE_SGI8: blockSize = 127; flagSize = 1; itemSize = 1; miscSize = 0; break; case PM_RLE_GRAPHON: case PM_RLE_PPA: blockSize = 64; flagSize = 1; itemSize = 1; miscSize = 0; break; case PM_RLE_SGI16: blockSize = 127; flagSize = 2; itemSize = 2; miscSize = 2; break; case PM_RLE_PALM16: blockSize = 128; flagSize = 1; itemSize = 2; miscSize = 0; break; default: pm_error(errorUndefinedMode, mode); } overhead = miscSize + (inSize / blockSize + (inSize % blockSize > 0 ? 1 : 0) ) * flagSize; if (inSize > INT_MAX / itemSize || inSize * itemSize > INT_MAX - overhead) pm_error("Cannot do RLE compression. Input too large."); return inSize * itemSize + overhead; } void pm_rlenc_allocoutbuf(unsigned char ** const outbufP, size_t const inSize, /* element count */ enum pm_RleMode const mode) { /*--------------------------------------------------------------------------- Allocate an output buffer sufficient for input with inSize elements, using compression mode 'mode'. Element may be byte or word, whichever 'mode' implies. -----------------------------------------------------------------------------*/ size_t const size = pm_rlenc_maxbytes(inSize, mode); unsigned char * outbuf; MALLOCARRAY(outbuf, size); if (outbuf == NULL) pm_error("Out of memory trying to get %u bytes for RLE output buffer", (unsigned)size); *outbufP = outbuf; } void pm_rlenc_freebuf(void * const buf) { free(buf); }