1 files changed, 429 insertions, 0 deletions
diff --git a/editor/pamflip/pamflip_sse.c b/editor/pamflip/pamflip_sse.c
new file mode 100644
index 00000000..e0929f65
--- /dev/null
+++ b/editor/pamflip/pamflip_sse.c
@@ -0,0 +1,429 @@
+/*=============================================================================
+                                   pamflip_sse.c
+===============================================================================
+  This is part of the Pamflip program.  It contains code that exploits
+  the SSE facility of some CPUs.
+
+  This code was originally written by Akira Urushibata ("Douso") in 2010 and is
+  contributed to the public domain by all authors.
+
+  The author makes the following request (which is not a reservation of legal
+  rights): Please study the code and make adjustments to meet specific needs.
+  This part is critical to performance.  I have seen code copied from
+  elsewhere poorly implemented.  A lot of work goes into the development of
+  free software.  It is sad to see derivative work which fails to reach its
+  potential.  Please put a comment in the code so people will know where it
+  came from.
+
+=============================================================================*/
+
+#include <assert.h>
+
+#include "pm_config.h"
+#include "pm_c_util.h"
+#include "mallocvar.h"
+#include "pam.h"
+
+#include "config.h"  /* Defines SSE_PBM_XY_FLIP */
+#include "flip.h"
+
+#include "pamflip_sse.h"
+
+/* Note that WANT_SSE implies the user expects SSE to be available
+   (i.e. <emmintrin.h> exists).
+*/
+
+#if SSE_PBM_XY_FLIP
+
+/*----------------------------------------------------------------------------
+   This is a specialized routine for row-for-column PBM transformations.
+   (-cw, -ccw, -xy).  It requires GCC (>= v. 4.2.0) and SSE2. 
+
+   In each cycle, we read sixteen rows from the input.  We process this band
+   left to right in blocks 8 pixels wide.  We use the SSE2 instruction
+   pmovmskb128, which reports the MSB of each byte in a 16 byte array, for
+   fast processing.  We place the 8x16 block into a 16 byte array, and
+   pmovmskb128 reports the 16 pixels on the left edge in one instruction
+   execution.  pslldi128 shifts the array contents leftward.
+
+   The following routines can write both in both directions (left and right)
+   into the output rows.  They do this by controlling the vertical stacking
+   order when they make the 8x16 blocks.
+
+   We do all transposition in 8x16 block units, adding padding to
+   the 8 row input buffer and the output plane raster as necessary.
+   doPartialBlockTop() or doPartialBlockBottom() handles the partial
+   input band.  This part can come from either the top or bottom of the
+   vertical input column, but always goes to the right end of the output
+   rows.
+
+   As an enhancement, we clear the output raster to zero (=white) in the
+   beginning and flip only the 8x16 blocks that contain non-zero bits (=any
+   amount of black pixels).  When we add padding to the edges, we initialize
+   it all to zero to prevent unnecessary transpositions.  Because most
+   real-world documents are largely white, this saves much execution time.  If
+   you are porting this code to an environment in which non-zero bits are the
+   majority, for example, BMP where zero means black, you should seriously
+   consider modifying this.
+
+   All instructions unique to GCC/SSE are in transpose16Bitrows().
+   It is possible to write a non-SSE version by providing a generic
+   version of transpose16Bitrows() or one tuned for a specific
+   architecture.  Use 8x8 blocks to avoid endian issues.
+ 
+   Further enhancement should be possible by employing wider bands,
+   larger blocks as wider SIMD registers become available.  Clearing
+   the white parts after instead of before transposition is also a
+   possibility.
+-----------------------------------------------------------------------------*/
+
+#include <emmintrin.h>
+
+typedef char v16qi __attribute__ ((vector_size (16)));
+typedef int  v4di  __attribute__ ((vector_size (16)));
+
+/* Beware when making modifications to code which involve SSE.
+   This is a sensitive part of GCC.  Different compiler versions
+   respond differently to trivial matters such as the difference
+   between above v16qi, v4di and a union defined for handling both.
+   What can be placed into a register is another issue.  Some
+   compilers issue warnings, others abort with error.
+
+   A char array cannot be loaded into v16qi by casting.  A vector
+   variable must be vector from the beginning.
+
+   Changes for your local system are okay, but if you intend to
+   publish them, please specify the compiler version you used.
+
+   This code has been tested on gcc versions 4.2.0, 4.2.4, 4.3.2,
+   4.4.3, 4.4.4, 4.5.0, 4.5.2, 4.6.0 and 4.6.1 clang versions
+   3.0, 3.2, 3.3.
+
+   We use SSE instructions in "_mm_" form in favor of "__builtin_".
+   In GCC the "__builtin_" form is documented but "_mm_" is not.
+   Former versions of this source file used "__builtin_".  This was
+   changed to make possible compilation with clang.
+
+   _mm_slli_epi32 : __builtin_ia32_pslldi128
+   _mm_cmpeq_epi8 : __builtin_ia32_pcmpeqb128
+   _mm_movemask_epi8 : __builtin_ia32_pmovmskb128
+
+   The conversion requires <emmintrin.h> .
+
+*/
+
+
+
+static void
+transpose16Bitrows(unsigned int const cols,
+                   unsigned int const rows,
+                   const bit *  const block[16],
+                   uint16_t **  const outplane,
+                   unsigned int const outcol16) {
+/*--------------------------------------------------------------------------
+  Convert input rows to output columns.  Works in units of 8x16.
+
+  Uses pre-calculated pointers ( block[i][col8] ) instead of
+  (xdir > 0) ? (i & 0x08) + 7 - (i & 0x07) : (24 - rows % 16 +i) % 16
+  for efficiency.
+
+  We load the block directly into a register.  (using a union like:
+
+       union V16 {
+          v16qi v;
+          unsigned char i[16];
+       };
+  )
+
+  gcc (v. 4.2, 4.4) sees the suffix [x] of v16.i[x] and apparently decides
+  that the variable has to be addressable and therefore needs to be placed
+  into memory.)
+---------------------------------------------------------------------------*/
+    unsigned int col;
+    register v16qi zero128;   /* 16 bytes of zero, in a SSE register */
+
+    zero128 = zero128 ^ zero128;
+
+    for (col = 0; col < cols; col += 8) {
+        unsigned int const col8 = col / 8;
+
+        register v16qi vReg = {
+            block[0][col8],  block[1][col8],
+            block[2][col8],  block[3][col8],  
+            block[4][col8],  block[5][col8],
+            block[6][col8],  block[7][col8],
+            block[8][col8],  block[9][col8],
+            block[10][col8], block[11][col8],
+            block[12][col8], block[13][col8],
+            block[14][col8], block[15][col8] };
+
+        register __m128i const compare =
+            _mm_cmpeq_epi8((__m128i)vReg, (__m128i)zero128);
+
+        if (_mm_movemask_epi8(compare) != 0xffff) {
+
+            /* There is some black content in this block; write to outplane */
+            
+            unsigned int outrow;
+            unsigned int i;
+
+            outrow = col;  /* initial value */
+
+            for (i = 0; i < 7; ++i) {
+                /* GCC (>=4.2) automatically unrolls this loop */  
+                outplane[outrow++][outcol16] =
+                    _mm_movemask_epi8((__m128i)vReg);
+                vReg = (v16qi)_mm_slli_epi32((__m128i)vReg, 1);
+            }
+            outplane[outrow][outcol16] = _mm_movemask_epi8((__m128i)vReg);
+        } else {
+            /* The block is completely white; skip. */
+        }
+    }
+}
+
+
+
+static void
+analyzeBlock(const struct pam * const inpamP,
+             bit **             const inrow,
+             int                const xdir,
+             const bit **       const block,
+             const bit **       const blockPartial,
+             unsigned int *     const topOfFullBlockP,
+             unsigned int *     const outcol16P) {
+/*--------------------------------------------------------------------------
+  Set up block[] pointer array.  Provide for both directions and the two
+  "twists" brought about by Intel byte ordering which occur when:
+    (1) 16 bytes are loaded to a SSE register
+    (2) 16 bits are written to memory.
+ 
+  If 'rows' is not a multiple of 8, a partial input band appears at one edge.
+  Set *topOfFullBlockP accordingly.  blockPartial[] is an adjusted "block" for
+  this partial band, brought up to a size of 8 rows.  The extra pointers point
+  to a single row which doPartialBlockTop() and doPartialBlockBottom() clear
+  to white.
+---------------------------------------------------------------------------*/
+    if (xdir > 0){
+        /* Write output columns left to right */
+        unsigned int i;
+        for (i = 0; i < 16; ++i){
+            unsigned int const index = (i & 0x8) + 7 - (i & 0x7);
+            /* Absorb little-endian "twists" */
+            block[i] = inrow[index];
+            blockPartial[i] = index < inpamP->height%16 ? block[i] : inrow[15];
+        }
+        *topOfFullBlockP = 0;
+        *outcol16P = 0;
+    } else {
+        /* Write output columns right to left */
+        unsigned int i;
+        for (i = 0; i < 16; ++i){
+            unsigned int const index = ((i & 0x8) ^ 0x8) + (i & 0x7);
+            /* Absorb little-endian "twists" */
+            block[i]= inrow[index];
+            blockPartial[i] = index < (16-inpamP->height%16)
+                ? inrow[0]
+                : block[i];
+        }
+        *topOfFullBlockP = inpamP->height % 16;
+    
+        if (inpamP->height >= 16) {
+            *outcol16P = inpamP->height/16 - 1;
+        } else
+            *outcol16P = 0;
+    }
+}
+
+
+
+static void
+doPartialBlockTop(const struct pam * const inpamP,
+                  bit **             const inrow,
+                  const bit *        const blockPartial[16],
+                  unsigned int       const topOfFullBlock,
+                  uint16_t **        const outplane) {
+    
+    if (topOfFullBlock > 0) {
+        unsigned int colChar, row;
+        unsigned int pad = 16 - topOfFullBlock;
+
+        for (colChar=0; colChar < pbm_packed_bytes(inpamP->width); ++colChar)
+            inrow[0][colChar] = 0x00;
+
+        for (row = 0; row < topOfFullBlock; ++row){
+            pbm_readpbmrow_packed(inpamP->file, inrow[row+pad],
+                                  inpamP->width, inpamP->format);
+            if (inpamP->width % 8 > 0){
+                /* Clear partial byte at end of input row */
+                int const lastByte = pbm_packed_bytes(inpamP->width) -1;
+
+                inrow[row+pad][lastByte] >>= (8 - inpamP->width % 8);
+                inrow[row+pad][lastByte] <<= (8 - inpamP->width % 8);
+            }
+        }
+
+        transpose16Bitrows(inpamP->width, inpamP->height, blockPartial,
+                           outplane, inpamP->height /16);
+            /* Transpose partial rows on top of input.  Place on right edge of
+               output.
+            */ 
+    }
+}
+
+
+
+static void
+doFullBlocks(const struct pam * const inpamP,
+             bit **             const inrow,
+             int                const xdir,
+             const bit *        const block[16],
+             unsigned int       const topOfFullBlock,
+             unsigned int       const initOutcol16,
+             uint16_t **        const outplane) {
+
+    unsigned int row;
+    unsigned int outcol16;
+    unsigned int modrow;
+    /* Number of current row within buffer */
+
+    for (row = topOfFullBlock, outcol16 = initOutcol16, modrow = 0;
+         row < inpamP->height;
+         ++row) {
+
+        pbm_readpbmrow_packed(inpamP->file, inrow[modrow],
+                              inpamP->width, inpamP->format);
+        if (inpamP->width % 8 > 0) {
+            /* Clear partial byte at end of input row */
+            int const lastByte = pbm_packed_bytes(inpamP->width) - 1;
+            inrow[modrow][lastByte] >>= (8 - inpamP->width % 8);
+            inrow[modrow][lastByte] <<= (8 - inpamP->width % 8);
+        }
+
+        ++modrow;
+        if (modrow == 16) {
+            /* 16 row buffer is full.  Transpose. */
+            modrow = 0; 
+
+            transpose16Bitrows(inpamP->width, inpamP->height,
+                               block, outplane, outcol16);
+            outcol16 += xdir;
+        }
+    }
+}
+
+
+
+static void
+doPartialBlockBottom(const struct pam * const inpamP,
+                     bit **             const inrow,
+                     int                const xdir,
+                     const bit *        const blockPartial[16],
+                     uint16_t **        const outplane) {
+    
+    if (xdir > 0 && inpamP->height % 16 > 0) {
+        unsigned int colChar;
+
+        for (colChar=0; colChar < pbm_packed_bytes(inpamP->width); ++colChar)
+            inrow[15][colChar] = 0x00;
+
+        transpose16Bitrows(inpamP->width, inpamP->height, blockPartial,
+                           outplane, inpamP->height/16);
+            /* Transpose partial rows on bottom of input.  Place on right edge
+               of output.
+            */ 
+    }
+}
+
+
+
+static void
+writeOut(const struct pam * const outpamP,
+         uint16_t **        const outplane,
+         int                const ydir) {
+             
+    unsigned int row;
+
+    for (row = 0; row < outpamP->height; ++row) {
+        unsigned int const outrow = (ydir > 0) ?
+            row :
+            outpamP->height - row - 1;  /* reverse order */
+
+        pbm_writepbmrow_packed(stdout, (bit *)outplane[outrow],
+                               outpamP->width, 0);
+    }
+}
+
+
+static void
+clearOutplane(const struct pam * const outpamP,
+              uint16_t **        const outplane) { 
+    
+    unsigned int row;
+    
+    for (row = 0; row < outpamP->height; ++row) {
+        unsigned int col16;  /* column divided by 16 */
+        for (col16 = 0; col16 < (outpamP->width + 15)/16; ++col16)
+            outplane[row][col16] = 0x0000;
+    }
+} 
+
+
+
+void
+pamflip_transformRowsToColumnsPbmSse(const struct pam * const inpamP,
+                                     const struct pam * const outpamP,
+                                     struct xformCore const xformCore) { 
+/*----------------------------------------------------------------------------
+  This is a specialized routine for row-for-column PBM transformations.
+  (-cw, -ccw, -xy).
+-----------------------------------------------------------------------------*/
+    int const xdir = xformCore.c;
+        /* Input top  => output left (+1)/ right (-1)  */
+    int const ydir = xformCore.b;
+        /* Input left => output top  (+1)/ bottom (-1) */
+    int const blocksPerRow = ((unsigned int) outpamP->width + 15) /16;
+
+    bit ** inrow;
+    uint16_t ** outplane;
+    const bit * block[16];
+    const bit * blockPartial[16];
+    unsigned int topOfFullBlock;
+    unsigned int outcol16;
+
+    inrow = pbm_allocarray_packed( inpamP->width, 16);
+    MALLOCARRAY2(outplane, outpamP->height + 7, blocksPerRow);
+    if (outplane == NULL)
+        pm_error("Could not allocate %u x %u array of 16 bit units",
+                 blocksPerRow, outpamP->height + 7);
+
+    /* We write to the output array in 16 bit units.  Add margin. */  
+
+    clearOutplane(outpamP, outplane);
+
+    analyzeBlock(inpamP, inrow, xdir, block, blockPartial, 
+                 &topOfFullBlock, &outcol16);
+
+    doPartialBlockTop(inpamP, inrow, blockPartial, topOfFullBlock, outplane);
+
+    doFullBlocks(inpamP, inrow, xdir, block,
+                 topOfFullBlock, outcol16, outplane);
+
+    doPartialBlockBottom(inpamP, inrow, xdir, blockPartial, outplane);
+
+    writeOut(outpamP, outplane, ydir);
+
+    pbm_freearray(outplane, outpamP->height + 7);
+    pbm_freearray(inrow, 16);
+}
+#else  /* WANT_SSE */
+
+void
+pamflip_transformRowsToColumnsPbmSse(const struct pam * const inpamP,
+                                     const struct pam * const outpamP,
+                                     struct xformCore   const xformCore) { 
+
+    /* Nobody is supposed to call this */
+    assert(false);
+}
+#endif