From 60a0d64307394bad62684c32bbccf89552d15090 Mon Sep 17 00:00:00 2001
From: giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8>
Date: Tue, 18 May 2010 02:37:18 +0000
Subject: Pamflip PBM speedup with SSE et al

git-svn-id: http://svn.code.sf.net/p/netpbm/code/trunk@1214 9d0c8265-081b-0410-96cb-a4ca84ce46f8
---
 editor/pamflip/pamflip_sse.c | 345 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 345 insertions(+)
 create mode 100644 editor/pamflip/pamflip_sse.c

(limited to 'editor/pamflip/pamflip_sse.c')

diff --git a/editor/pamflip/pamflip_sse.c b/editor/pamflip/pamflip_sse.c
new file mode 100644
index 00000000..3a2d28d7
--- /dev/null
+++ b/editor/pamflip/pamflip_sse.c
@@ -0,0 +1,345 @@
+#include <assert.h>
+
+#include "pm_config.h"
+#include "pm_c_util.h"
+#include "pam.h"
+
+#include "flip.h"
+
+#include "pamflip_sse.h"
+
+#if HAVE_GCC_SSE2 && defined(__SSE2__)
+
+/*----------------------------------------------------------------------------
+   This is a specialized routine for row-for-column PBM transformations.
+   (-cw, -ccw, -xy).  It requires GCC (>= v. 4.0.4) and SSE2. 
+
+   In each cycle, we read sixteen rows from the input.  We process this band
+   left to right in blocks 8 pixels wide.  We use the SSE2 instruction
+   pmovmskb128, which reports the MSB of each byte in a 16 byte array, for
+   fast processing.  We place the 8x16 block into a 16 byte array, and
+   pmovmskb128 reports the 16 pixels on the left edge in one instruction
+   execution.  pslldi128 shifts the array contents leftward.
+
+   The following routines can write in both directions (left and right)
+   into the output rows.  They do this by controlling the vertical stacking
+   order when they make the 8x16 blocks.
+ 
+   Function transpose1to15Bitrows() is for handling the partial bits of each
+   output row.  They can come from either the top or bottom of the vertical
+   input column, but they always go to the right end of the output rows.
+
+   transformRowsToColumnsPbm() does not have any instructions unique to
+   GCC or SSE.  It is possible to write a non-SSE version by providing
+   generic versions of transpose16Bitrows() and transpose1to15Bitrows() .
+   This is just a matter of replacing the V16 union with a plain uchar
+   array and writing an emulation for __builtin_pmovmskb128() .
+ 
+   Further enhancement should be possible by employing wider bands,
+   larger blocks as wider SIMD registers become available.  Another
+   method is checking for white blocks and recording them in a small
+   array and condensing writes into the output raster array.
+-----------------------------------------------------------------------------*/
+
+typedef char v16qi __attribute__ ((vector_size (16)));  /* 16-byte char vec */
+typedef int  v4di  __attribute__ ((vector_size (16)));  /* 16-byte int vec  */
+/* The same 16 bytes, viewed three ways */
+union V16 {
+    v16qi v;              /* passed to __builtin_ia32_pmovmskb128() */
+    v4di d;               /* passed to __builtin_ia32_pslldi128() */
+    unsigned char i[16];  /* individual bytes, for loading input data */
+};
+
+/* Beware when making modifications to code which involve v16qi, v4di, V16.
+   Certain versions of GCC get stuck with the following:
+
+   (1) Type mismatches between v16qi and v4di.  Avoid them with casts.
+
+   (2) Conversions from a 16 byte array to v16qi (or union V16) by cast.
+       __vector__ variables have to be vector from the start. 
+
+   (3) union V16 as a register variable.
+
+   Some GCC versions emit warnings, others abort with error.
+*/
+
+
+
+static void
+transpose1to15Bitrows(unsigned int const cols,
+                      unsigned int const rows,
+                      bit **       const inrow,
+                      uint16_t **  const outplane,
+                      int          const xdir) {
+/*--------------------------------------------------------------------------
+  Transpose the first (rows % 16) rows of 'inrow' ('cols' pixels each) into
+  16-bit word (rows-1)/16 -- the right edge -- of output rows 0 .. cols-1.
+----------------------------------------------------------------------------*/
+    unsigned int const outcol16 = (rows-1)/16;
+        /* Index of the 16-bit word written in each output row */
+    unsigned int col;
+        /* Input pixel column, which is also the output row number */
+    union V16 v16;
+    v16.v = v16.v ^ v16.v;  /* clear to zero */
+        /* Only (rows % 16) of the 16 bytes are ever loaded with input below */
+    for (col = 0; col < cols; ++col) {
+        unsigned int const outrow = col;
+            /* Every 8 pixel columns, fetch the next packed byte of each row */
+        if (col % 8 == 0) {
+            unsigned int i;
+            for (i = 0; i < rows % 16; ++i) {
+                int const idx = (xdir > 0) ?
+                    (i&8) + 7-(i&7) :       /* output left to right */
+                    (24- rows%16 +i) %16;  /*        right to left */
+                v16.i [idx] = inrow[i][col/8];
+            }
+        }
+        /* read 16 bits from left edge of block; write to output column  */
+        outplane[outrow][outcol16] = __builtin_ia32_pmovmskb128(v16.v);
+        v16.d = __builtin_ia32_pslldi128(v16.d, 1);  /* next pixel column */
+    }
+}
+
+
+
+static void
+transpose16Bitrows(unsigned int const cols,
+      	           unsigned int const rows,
+                   const bit *  const block[16],
+                   uint16_t **  const outplane,
+                   unsigned int const outcol16) {
+/*--------------------------------------------------------------------------
+  Convert input rows to output columns.  Works in units of 8x16.
+  Writes 16-bit word 'outcol16' of output rows 0 .. cols-1.  'rows' is unused.
+  Uses pre-calculated pointers ( block[i][col8] ) instead of
+  (xdir > 0) ? (i & 0x08) + 7 - (i & 0x07) : (24 - rows % 16 +i) % 16
+  for efficiency.
+
+  We avoid using union V16 to keep the value in a register.  (When we do so,
+  GCC (4.2, 4.4) sees the suffix x of v16.i[x] and apparently decides that
+  the variable has to be addressable and therefore needs to be placed into
+  memory.)
+  ----------------------------------------------------------------------------*/
+    unsigned int col;
+        /* Current input pixel column, which is also the output row number */
+    for (col = 0; col + 7 < cols; col += 8) {    /* Unrolled loop */
+        unsigned int const col8 = col / 8;
+
+        unsigned int outrow;
+        unsigned int i;
+        /* Gather byte 'col8' of all 16 block rows into one vector */
+        register v16qi vReg = {
+            block[0][col8],  block[1][col8],
+            block[2][col8],  block[3][col8],  
+            block[4][col8],  block[5][col8],
+            block[6][col8],  block[7][col8],
+            block[8][col8],  block[9][col8],
+            block[10][col8], block[11][col8],
+            block[12][col8], block[13][col8],
+            block[14][col8], block[15][col8] };
+
+        outrow = col;  /* initial value */
+
+        for (i = 0; i < 7; ++i) {
+            /* GCC (>=4.2) automatically unrolls this loop */  
+            outplane[outrow++][outcol16] = __builtin_ia32_pmovmskb128(vReg);
+            vReg = (v16qi)__builtin_ia32_pslldi128 ((v4di)vReg, 1);
+        }
+        outplane[outrow][outcol16] = __builtin_ia32_pmovmskb128(vReg);
+    }
+
+    if (col < cols) {  /* Transpose remaining pixels at end of input rows. */
+        unsigned int const col8 = col / 8;
+        register v16qi vReg = {
+            block[0][col8],  block[1][col8],
+            block[2][col8],  block[3][col8],  
+            block[4][col8],  block[5][col8],
+            block[6][col8],  block[7][col8],
+            block[8][col8],  block[9][col8],
+            block[10][col8], block[11][col8],
+            block[12][col8], block[13][col8],
+            block[14][col8], block[15][col8] };
+        /* One output row per remaining pixel column (fewer than 8 of them) */
+        for ( ; col < cols; ++col) { 
+            unsigned int const outrow = col;
+
+            outplane[outrow][outcol16] = __builtin_ia32_pmovmskb128(vReg);
+            vReg = (v16qi)__builtin_ia32_pslldi128((v4di)vReg, 1);
+        }
+    }
+}
+
+
+
+static void
+analyzeBlock(struct pam *   const inpamP,
+             bit **         const inrow,
+             int            const xdir,
+             const bit **   const block,
+             unsigned int * const topOfFullBlockP,
+             unsigned int * const outcol16P) {
+    /* Set block[] stacking order, first full-block row, first output word. */
+    if (xdir > 0){
+        /* Write output columns left to right */
+        unsigned int i;
+        for (i = 0; i < 16; ++i)
+            block[i] = inrow[(i & 0x8) + 7 - (i & 0x7)];
+        *topOfFullBlockP = 0;  /* full blocks start at the first input row */
+        *outcol16P = 0;        /* start with the leftmost 16-bit word */
+    } else {
+        /* Write output columns right to left */
+        *topOfFullBlockP = inpamP->height % 16;  /* partial rows come first */
+
+        if (inpamP->height >= 16) {
+            unsigned int i;
+            for (i = 0; i < 16; ++i)
+                block[i]= inrow[((i & 0x8) ^ 0x8) + (i & 0x7)];
+            *outcol16P = inpamP->height/16 - 1;  /* last full 16-bit word */
+        } else
+            *outcol16P = 0;  /* no full block; block[] is left unassigned */
+    }
+}
+
+
+
+static void
+doPartialBlockTop(struct pam * const inpamP,
+                  bit **       const inrow,
+                  int          const xdir,
+                  unsigned int const topOfFullBlock,
+                  uint16_t **  const outplane) {
+    /* Read the first 'topOfFullBlock' input rows and transpose them, if any. */
+    if (topOfFullBlock > 0) {
+        unsigned int row;
+        for (row = 0; row < topOfFullBlock; ++row)
+            pbm_readpbmrow_packed(inpamP->file, inrow[row],
+                                  inpamP->width, inpamP->format);
+
+        transpose1to15Bitrows(inpamP->width, inpamP->height,
+                              inrow, outplane, xdir);
+            /* Transpose partial rows on top of input.  Place on right edge of
+               output.
+            */ 
+    }
+}
+
+
+
+static void
+doFullBlocks(struct pam * const inpamP,
+             bit **       const inrow,
+             int          const xdir,
+             const bit *  const block[16],
+             unsigned int const topOfFullBlock,
+             unsigned int const initOutcol16,
+             uint16_t **  const outplane) {
+    /* Read rows topOfFullBlock .. height-1; transpose each full group of 16. */
+    unsigned int row;
+    unsigned int outcol16;
+    unsigned int modrow;
+        /* Number of current row within buffer */
+    /* A final group of fewer than 16 rows is read but not transposed here */
+    for (row = topOfFullBlock, outcol16 = initOutcol16, modrow = 0;
+         row < inpamP->height;
+         ++row) {
+
+        pbm_readpbmrow_packed(inpamP->file, inrow[modrow],
+                              inpamP->width, inpamP->format);
+        ++modrow;
+        if (modrow == 16) {
+            /* 16 row buffer is full.  Transpose. */
+            modrow = 0; 
+
+            transpose16Bitrows(inpamP->width, inpamP->height,
+                               block, outplane, outcol16);
+            outcol16 += xdir;  /* step to the next output word */
+        }
+    }
+}
+
+
+
+static void
+doPartialBlockBottom(struct pam * const inpamP,
+                     bit **       const inrow,
+                     int          const xdir,
+                     uint16_t **  const outplane) {
+    /* If xdir > 0 and height % 16 > 0, transpose what is buffered in 'inrow'. */
+    if (xdir > 0 && inpamP->height % 16 > 0)
+        transpose1to15Bitrows(inpamP->width, inpamP->height, inrow,
+                              outplane, xdir);
+        /* Transpose partial rows on bottom of input.  Place on right edge of
+           output.
+        */ 
+}
+
+
+
+static void
+writeOut(struct pam * const outpamP,
+         uint16_t **  const outplane,
+         int          const ydir) {
+    /* Write every row of 'outplane' as packed PBM; last row first if ydir <= 0 */
+    unsigned int row;
+
+    for (row = 0; row < outpamP->height; ++row) {
+        unsigned int const outrow = (ydir > 0) ?
+            row :
+            outpamP->height - row - 1;  /* reverse order */
+        /* NOTE(review): goes to stdout, not outpamP->file -- confirm intended */
+        pbm_writepbmrow_packed(stdout, (bit *)outplane[outrow],
+                               outpamP->width, 0);
+    }
+}
+
+
+
+void
+pamflip_transformRowsToColumnsPbmSse(struct pam *     const inpamP,
+                                     struct pam *     const outpamP,
+                                     struct xformCore const xformCore) { 
+/*----------------------------------------------------------------------------
+  This is a specialized routine for row-for-column PBM transformations
+  (-cw, -ccw, -xy).  It buffers the entire output image before writing it.
+-----------------------------------------------------------------------------*/
+    int const xdir = xformCore.c;
+        /* Input top:  output left (+1)/ right (-1)  */
+    int const ydir = xformCore.b;
+        /* Input left: output top  (+1)/ bottom (-1) */
+
+    bit ** inrow;                 /* buffer of 16 packed input rows */
+    uint16_t ** outplane;         /* all output rows, as 16-bit words */
+    const bit * block[16];        /* inrow[] pointers, set by analyzeBlock() */
+    unsigned int topOfFullBlock;  /* first input row of the full blocks */
+    unsigned int outcol16;        /* output word for the first full block */
+
+    inrow = pbm_allocarray_packed( inpamP->width, 16);
+    outplane = (uint16_t **)pbm_allocarray_packed(outpamP->width + 15,
+                                                  outpamP->height);
+        /* We write to the output array in 16 bit units.  Add margin (15). */  
+
+    analyzeBlock(inpamP, inrow, xdir, block, &topOfFullBlock, &outcol16);
+    /* Does nothing unless topOfFullBlock > 0 */
+    doPartialBlockTop(inpamP, inrow, xdir, topOfFullBlock, outplane);
+
+    doFullBlocks(inpamP, inrow, xdir, block,
+                 topOfFullBlock, outcol16, outplane);
+    /* Does nothing unless xdir > 0 and the input height % 16 > 0 */
+    doPartialBlockBottom(inpamP, inrow, xdir, outplane);
+
+    writeOut(outpamP, outplane, ydir);
+    /* Release both buffers */
+    pbm_freearray(outplane, outpamP->height);
+    pbm_freearray(inrow, 16);
+}
+#else  /* No GCC SSE2 support; see the #if above */
+
+void
+pamflip_transformRowsToColumnsPbmSse(struct pam *     const inpamP,
+                                     struct pam *     const outpamP,
+                                     struct xformCore const xformCore) { 
+    /* Stub defined when the SSE2 implementation is not compiled in. */
+    /* Nobody is supposed to call this */
+    assert(false);
+}
+#endif 
-- 
cgit 1.4.1