cleanup

git-svn-id: http://svn.code.sf.net/p/netpbm/code/trunk@1248 9d0c8265-081b-0410-96cb-a4ca84ce46f8
author: giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8> 2010-06-27 17:57:35 +0000
committer: giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8> 2010-06-27 17:57:35 +0000
commit: e0d1d6e5f3836c0f5b2bf20a8666b312802a540e (patch)
tree: 543bfd8eeba09d2a50b6b6c6ec3fbf6a60becad5 /editor
parent: 7c7b2eba2d70845da2ca2c95e3cc0f6d91c0eb2c (diff)
download: netpbm-mirror-e0d1d6e5f3836c0f5b2bf20a8666b312802a540e.tar.gz
netpbm-mirror-e0d1d6e5f3836c0f5b2bf20a8666b312802a540e.tar.xz
netpbm-mirror-e0d1d6e5f3836c0f5b2bf20a8666b312802a540e.zip
2 files changed, 193 insertions, 156 deletions
diff --git a/editor/pamflip/pamflip_sse.c b/editor/pamflip/pamflip_sse.c
index 5e39e719..f953ab68 100644
--- a/editor/pamflip/pamflip_sse.c
+++ b/editor/pamflip/pamflip_sse.c
@@ -41,87 +41,55 @@
    The following routines can write both in both directions (left and right)
    into the output rows.  They do this by controlling the vertical stacking
    order when they make the 8x16 blocks.
- 
-   Function transpose1to15Bitrows() is for handling the partial bits of each
-   output row.  They can come from either the top or bottom of the vertical
-   input column, but they always go to the right end of the output rows.
-
-   transformRowsToColumnsPbm() does not have any instructions unique to
-   GCC or SSE.  It is possible to write a non-SSE version by providing
-   generic versions of transpose16Bitrows() and transpose1to15Bitrows() .
-   This is just a matter of replacing the V16 union with a plain uchar
-   array and writing an emulation for __builtin_pmovmskb128() .
+
+   We do all transposition in 8x16 block units, adding padding to
+   the 8 row input buffer and the output plane raster as necessary.
+   doPartialBlockTop() or doPartialBlockBottom() handles the partial
+   input band.  This part can come from either the top or bottom of the
+   vertical input column, but always goes to the right end of the output
+   rows.
+
+   As an enhancement, we clear the output raster to zero (=white) in the
+   beginning and flip only the 8x16 blocks that contain non-zero bits (=any
+   amount of black pixels).  When we add padding to the edges, we initialize
+   it all to zero to prevent unnecessary transpositions.
+
+   All instructions unique to GCC/SSE are in transpose16Bitrows().
+   It is possible to write a non-SSE version by providing a generic
+   version of transpose16Bitrows() or one tuned for a specific
+   architecture.  Use 8x8 blocks to avoid endian issues.
  
    Further enhancement should be possible by employing wider bands,
-   larger blocks as wider SIMD registers become available.  Another
-   method is checking for white blocks and recording them in a small
-   array and condensing writes into the output raster array.
+   larger blocks as wider SIMD registers become available.  Clearing
+   the white parts after instead of before transposition is also a
+   possibility.
 -----------------------------------------------------------------------------*/
 
 typedef char v16qi __attribute__ ((vector_size (16)));
 typedef int  v4di  __attribute__ ((vector_size (16)));
 
-union V16 {
-    v16qi v;
-    v4di d;
-    unsigned char i[16];
-};
-
-/* Beware when making modifications to code which involve v16qi, v4di, V16.
-   Certain versions of GCC get stuck with the following:
+/* Beware when making modifications to code which involve SSE.
+   This is a sensitive part of GCC.  Different compiler versions
+   respond differently to trivial matters such as the difference
+   between above v16qi, v4di and a union defined for handling both.
+   What can be placed into a register is another issue.  Some
+   compilers issue warnings, others abort with error.
 
-   (1) Type mismatches between v16qi and v4di.  Avoid them with casts.
+   A char array cannot be loaded into v16qi by casting.  A vector
+   variable must be vector from the beginning.
 
-   (2) Converions from a 16 byte array to v16qi (or union V16) by cast.
-       __vector__ variables have to be vector from the start. 
+   Changes for your local system are okay, but if you intend to
+   publish them, please specify the compiler version you used.
 
-   (3) union V16 as a register variable.
-
-   Some GCC versions emit warnings, others abort with error.
+   This code has been tested on gcc versions: 4.2.0, 4.2.4, 4.3.2,
+   4.4.3, 4.4.4 and 4.5.0 .
 */
 
 
 
 static void
-transpose1to15Bitrows(unsigned int const cols,
-                      unsigned int const rows,
-                      bit **       const inrow,
-                      uint16_t **  const outplane,
-                      int          const xdir) {
-/*--------------------------------------------------------------------------
-  Convert input rows to output columns.  For handling partial rows.
-  Note that output from this always goes to the right edge of the image.
-----------------------------------------------------------------------------*/
-    unsigned int const outcol16 = (rows-1)/16;
-
-    unsigned int col;
-
-    union V16 v16;
-    v16.v = v16.v ^ v16.v;  /* clear to zero */
-
-    for (col = 0; col < cols; ++col) {
-        unsigned int const outrow = col;
-
-        if (col % 8 == 0) {
-            unsigned int i;
-            for (i = 0; i < rows % 16; ++i) {
-                int const idx = (xdir > 0) ?
-                    (i&8) + 7-(i&7) :       /* output left to right */
-                    (24- rows%16 +i) %16;  /*        right to left */
-                v16.i [idx] = inrow[i][col/8];
-            }
-        }
-        /* read 16 bits from left edge of block; write to output column  */
-        outplane[outrow][outcol16] = __builtin_ia32_pmovmskb128(v16.v);
-        v16.d = __builtin_ia32_pslldi128(v16.d, 1);
-    }
-}
-
-
-
-static void
 transpose16Bitrows(unsigned int const cols,
-      	           unsigned int const rows,
+                   unsigned int const rows,
                    const bit *  const block[16],
                    uint16_t **  const outplane,
                    unsigned int const outcol16) {
@@ -132,18 +100,25 @@ transpose16Bitrows(unsigned int const cols,
   (xdir > 0) ? (i & 0x08) + 7 - (i & 0x07) : (24 - rows % 16 +i) % 16
   for efficiency.
 
-  We avoid using union V16 to keep the value in a register.  (When we do so,
-  GCC (4.2, 4.4) sees the suffix x of v16.i[x] and apparently decides that
-  the variable has to be addressable and therefore needs to be placed into
-  memory.)
+  We load the block directly into a register, rather than using a union
+  like:
+
+       union V16 {
+          v16qi v;
+          unsigned char i[16];
+       };
+
+  (With such a union, gcc (v. 4.2, 4.4) sees the suffix [x] of v16.i[x] and
+  apparently decides that the variable has to be addressable and therefore
+  needs to be placed into memory.)
 ---------------------------------------------------------------------------*/
     unsigned int col;
+    register v16qi zero128;   /* 16 bytes of zero, in a SSE register */
 
-    for (col = 0; col + 7 < cols; col += 8) {    /* Unrolled loop */
-        unsigned int const col8 = col / 8;
+    zero128 = zero128 ^ zero128;
 
-        unsigned int outrow;
-        unsigned int i;
+    for (col = 0; col < cols; col += 8) {
+        unsigned int const col8 = col / 8;
 
         register v16qi vReg = {
             block[0][col8],  block[1][col8],
@@ -155,33 +130,26 @@ transpose16Bitrows(unsigned int const cols,
             block[12][col8], block[13][col8],
             block[14][col8], block[15][col8] };
 
-        outrow = col;  /* initial value */
+        register v16qi const compare =__builtin_ia32_pcmpeqb128(vReg,zero128);
 
-        for (i = 0; i < 7; ++i) {
-            /* GCC (>=4.2) automatically unrolls this loop */  
-            outplane[outrow++][outcol16] = __builtin_ia32_pmovmskb128(vReg);
-            vReg = (v16qi)__builtin_ia32_pslldi128 ((v4di)vReg, 1);
-        }
-        outplane[outrow][outcol16] = __builtin_ia32_pmovmskb128(vReg);
-    }
+        if (__builtin_ia32_pmovmskb128(compare) != 0xffff) {
 
-    if (col < cols) {  /* Transpose remaining pixels at end of input rows. */
-        unsigned int const col8 = col / 8;
-        register v16qi vReg = {
-            block[0][col8],  block[1][col8],
-            block[2][col8],  block[3][col8],  
-            block[4][col8],  block[5][col8],
-            block[6][col8],  block[7][col8],
-            block[8][col8],  block[9][col8],
-            block[10][col8], block[11][col8],
-            block[12][col8], block[13][col8],
-            block[14][col8], block[15][col8] };
+            /* There is some black content in this block; write to outplane */
+            
+            unsigned int outrow;
+            unsigned int i;
 
-        for ( ; col < cols; ++col) { 
-            unsigned int const outrow = col;
+            outrow = col;  /* initial value */
 
+            for (i = 0; i < 7; ++i) {
+                /* GCC (>=4.2) automatically unrolls this loop */  
+                outplane[outrow++][outcol16] =
+                    __builtin_ia32_pmovmskb128(vReg);
+                vReg = (v16qi)__builtin_ia32_pslldi128 ((v4di)vReg, 1);
+            }
             outplane[outrow][outcol16] = __builtin_ia32_pmovmskb128(vReg);
-            vReg = (v16qi)__builtin_ia32_pslldi128((v4di)vReg, 1);
+        } else {
+            /* The block is completely white; skip. */
         }
     }
 }
@@ -189,28 +157,50 @@ transpose16Bitrows(unsigned int const cols,
 
 
 static void
-analyzeBlock(struct pam *   const inpamP,
-             bit **         const inrow,
-             int            const xdir,
-             const bit **   const block,
-             unsigned int * const topOfFullBlockP,
-             unsigned int * const outcol16P) {
-
+analyzeBlock(const struct pam * const inpamP,
+             bit **             const inrow,
+             int                const xdir,
+             const bit **       const block,
+             const bit **       const blockPartial,
+             unsigned int *     const topOfFullBlockP,
+             unsigned int *     const outcol16P) {
+/*--------------------------------------------------------------------------
+  Set up block[] pointer array.  Provide for both directions and the two
+  "twists" brought about by Intel byte ordering which occur when:
+    (1) 16 bytes are loaded to a SSE register
+    (2) 16 bits are written to memory.
+ 
+  If the image height is not a multiple of 16, a partial input band appears
+  at one edge.  Set *topOfFullBlockP accordingly.  blockPartial[] is an
+  adjusted "block" for this partial band, brought up to a size of 16 rows.
+  The extra pointers point to a single row which doPartialBlockTop() and
+  doPartialBlockBottom() clear to white.
+---------------------------------------------------------------------------*/
     if (xdir > 0){
         /* Write output columns left to right */
         unsigned int i;
-        for (i = 0; i < 16; ++i)
-            block[i] = inrow[(i & 0x8) + 7 - (i & 0x7)];
+        for (i = 0; i < 16; ++i){
+            unsigned int const index = (i & 0x8) + 7 - (i & 0x7);
+            /* Absorb little-endian "twists" */
+            block[i] = inrow[index];
+            blockPartial[i] = index < inpamP->height%16 ? block[i] : inrow[15];
+        }
         *topOfFullBlockP = 0;
         *outcol16P = 0;
     } else {
         /* Write output columns right to left */
+        unsigned int i;
+        for (i = 0; i < 16; ++i){
+            unsigned int const index = ((i & 0x8) ^ 0x8) + (i & 0x7);
+            /* Absorb little-endian "twists" */
+            block[i]= inrow[index];
+            blockPartial[i] = index < (16-inpamP->height%16)
+                ? inrow[0]
+                : block[i];
+        }
         *topOfFullBlockP = inpamP->height % 16;
-
+    
         if (inpamP->height >= 16) {
-            unsigned int i;
-            for (i = 0; i < 16; ++i)
-                block[i]= inrow[((i & 0x8) ^ 0x8) + (i & 0x7)];
             *outcol16P = inpamP->height/16 - 1;
         } else
             *outcol16P = 0;
@@ -220,20 +210,33 @@ analyzeBlock(struct pam *   const inpamP,
 
 
 static void
-doPartialBlockTop(struct pam * const inpamP,
-                  bit **       const inrow,
-                  int          const xdir,
-                  unsigned int const topOfFullBlock,
-                  uint16_t **  const outplane) {
+doPartialBlockTop(const struct pam * const inpamP,
+                  bit **             const inrow,
+                  const bit *        const blockPartial[16],
+                  unsigned int       const topOfFullBlock,
+                  uint16_t **        const outplane) {
     
     if (topOfFullBlock > 0) {
-        unsigned int row;
-        for (row = 0; row < topOfFullBlock; ++row)
-            pbm_readpbmrow_packed(inpamP->file, inrow[row],
+        unsigned int colChar, row;
+        unsigned int pad = 16 - topOfFullBlock;
+
+        for (colChar=0; colChar < pbm_packed_bytes(inpamP->width); ++colChar)
+            inrow[0][colChar] = 0x00;
+
+        for (row = 0; row < topOfFullBlock; ++row){
+            pbm_readpbmrow_packed(inpamP->file, inrow[row+pad],
                                   inpamP->width, inpamP->format);
+            if (inpamP->width % 8 > 0){
+                /* Clear partial byte at end of input row */
+                int const lastByte = pbm_packed_bytes(inpamP->width) -1;
+
+                inrow[row+pad][lastByte] >>= (8 - inpamP->width % 8);
+                inrow[row+pad][lastByte] <<= (8 - inpamP->width % 8);
+            }
+        }
 
-        transpose1to15Bitrows(inpamP->width, inpamP->height,
-                              inrow, outplane, xdir);
+        transpose16Bitrows(inpamP->width, inpamP->height, blockPartial,
+                           outplane, inpamP->height /16);
             /* Transpose partial rows on top of input.  Place on right edge of
                output.
             */ 
@@ -243,18 +246,18 @@ doPartialBlockTop(struct pam * const inpamP,
 
 
 static void
-doFullBlocks(struct pam * const inpamP,
-             bit **       const inrow,
-             int          const xdir,
-             const bit *  const block[16],
-             unsigned int const topOfFullBlock,
-             unsigned int const initOutcol16,
-             uint16_t **  const outplane) {
+doFullBlocks(const struct pam * const inpamP,
+             bit **             const inrow,
+             int                const xdir,
+             const bit *        const block[16],
+             unsigned int       const topOfFullBlock,
+             unsigned int       const initOutcol16,
+             uint16_t **        const outplane) {
 
     unsigned int row;
     unsigned int outcol16;
     unsigned int modrow;
-        /* Number of current row within buffer */
+    /* Number of current row within buffer */
 
     for (row = topOfFullBlock, outcol16 = initOutcol16, modrow = 0;
          row < inpamP->height;
@@ -262,6 +265,13 @@ doFullBlocks(struct pam * const inpamP,
 
         pbm_readpbmrow_packed(inpamP->file, inrow[modrow],
                               inpamP->width, inpamP->format);
+        if (inpamP->width % 8 > 0) {
+            /* Clear partial byte at end of input row */
+            int const lastByte = pbm_packed_bytes(inpamP->width) - 1;
+            inrow[modrow][lastByte] >>= (8 - inpamP->width % 8);
+            inrow[modrow][lastByte] <<= (8 - inpamP->width % 8);
+        }
+
         ++modrow;
         if (modrow == 16) {
             /* 16 row buffer is full.  Transpose. */
@@ -277,25 +287,32 @@ doFullBlocks(struct pam * const inpamP,
 
 
 static void
-doPartialBlockBottom(struct pam * const inpamP,
-                     bit **       const inrow,
-                     int          const xdir,
-                     uint16_t **  const outplane) {
+doPartialBlockBottom(const struct pam * const inpamP,
+                     bit **             const inrow,
+                     int                const xdir,
+                     const bit *        const blockPartial[16],
+                     uint16_t **        const outplane) {
     
-    if (xdir > 0 && inpamP->height % 16 > 0)
-        transpose1to15Bitrows(inpamP->width, inpamP->height, inrow,
-                              outplane, xdir);
-        /* Transpose partial rows on bottom of input.  Place on right edge of
-           output.
-        */ 
+    if (xdir > 0 && inpamP->height % 16 > 0) {
+        unsigned int colChar;
+
+        for (colChar=0; colChar < pbm_packed_bytes(inpamP->width); ++colChar)
+            inrow[15][colChar] = 0x00;
+
+        transpose16Bitrows(inpamP->width, inpamP->height, blockPartial,
+                           outplane, inpamP->height/16);
+            /* Transpose partial rows on bottom of input.  Place on right edge
+               of output.
+            */ 
+    }
 }
 
 
 
 static void
-writeOut(struct pam * const outpamP,
-         uint16_t **  const outplane,
-         int          const ydir) {
+writeOut(const struct pam * const outpamP,
+         uint16_t **        const outplane,
+         int                const ydir) {
              
     unsigned int row;
 
@@ -303,57 +320,77 @@ writeOut(struct pam * const outpamP,
         unsigned int const outrow = (ydir > 0) ?
             row :
             outpamP->height - row - 1;  /* reverse order */
-  
+
         pbm_writepbmrow_packed(stdout, (bit *)outplane[outrow],
                                outpamP->width, 0);
     }
 }
 
 
+static void
+clearOutplane(const struct pam * const outpamP,
+              uint16_t **        const outplane) { 
+    
+    unsigned int row;
+    
+    for (row = 0; row < outpamP->height; ++row) {
+        unsigned int col16;  /* column divided by 16 */
+        for (col16 = 0; col16 < (outpamP->width + 15)/16; ++col16)
+            outplane[row][col16] = 0x0000;
+    }
+} 
+
+
 
 void
-pamflip_transformRowsToColumnsPbmSse(struct pam *     const inpamP,
-                                     struct pam *     const outpamP,
+pamflip_transformRowsToColumnsPbmSse(const struct pam * const inpamP,
+                                     const struct pam * const outpamP,
                                      struct xformCore const xformCore) { 
 /*----------------------------------------------------------------------------
   This is a specialized routine for row-for-column PBM transformations.
   (-cw, -ccw, -xy).
 -----------------------------------------------------------------------------*/
     int const xdir = xformCore.c;
-        /* Input top:  output left (+1)/ right (-1)  */
+        /* Input top  => output left (+1)/ right (-1)  */
     int const ydir = xformCore.b;
-        /* Input left: output top  (+1)/ bottom (-1) */
+        /* Input left => output top  (+1)/ bottom (-1) */
+    int const blocksPerRow = ((unsigned int) outpamP->width + 15) /16;
 
     bit ** inrow;
     uint16_t ** outplane;
     const bit * block[16];
+    const bit * blockPartial[16];
     unsigned int topOfFullBlock;
     unsigned int outcol16;
 
     inrow = pbm_allocarray_packed( inpamP->width, 16);
-    outplane = (uint16_t **)pbm_allocarray_packed(outpamP->width + 15,
-                                                  outpamP->height);
-        /* We write to the output array in 16 bit units.  Add margin (15). */  
+    outplane =
+      (uint16_t **) pm_allocarray( blocksPerRow, outpamP->height + 7, 2);
+
+    /* We write to the output array in 16 bit units.  Add margin. */  
+
+    clearOutplane(outpamP, outplane);
 
-    analyzeBlock(inpamP, inrow, xdir, block, &topOfFullBlock, &outcol16);
+    analyzeBlock(inpamP, inrow, xdir, block, blockPartial, 
+                 &topOfFullBlock, &outcol16);
 
-    doPartialBlockTop(inpamP, inrow, xdir, topOfFullBlock, outplane);
+    doPartialBlockTop(inpamP, inrow, blockPartial, topOfFullBlock, outplane);
 
     doFullBlocks(inpamP, inrow, xdir, block,
                  topOfFullBlock, outcol16, outplane);
 
-    doPartialBlockBottom(inpamP, inrow, xdir, outplane);
+    doPartialBlockBottom(inpamP, inrow, xdir, blockPartial, outplane);
 
     writeOut(outpamP, outplane, ydir);
 
-    pbm_freearray(outplane, outpamP->height);
+    pbm_freearray(outplane, outpamP->height + 7);
     pbm_freearray(inrow, 16);
 }
 #else  /* SSE functions exist */
 
 void
-pamflip_transformRowsToColumnsPbmSse(struct pam *     const inpamP,
-                                     struct pam *     const outpamP,
+pamflip_transformRowsToColumnsPbmSse(const struct pam * const inpamP,
+                                     const struct pam * const outpamP,
                                      struct xformCore const xformCore) { 
 
     /* Nobody is supposed to call this */
diff --git a/editor/pamflip/pamflip_sse.h b/editor/pamflip/pamflip_sse.h
index 1e70b765..59e7c026 100644
--- a/editor/pamflip/pamflip_sse.h
+++ b/editor/pamflip/pamflip_sse.h
@@ -5,8 +5,8 @@ struct pam;
 #include "flip.h"
 
 void
-pamflip_transformRowsToColumnsPbmSse(struct pam *     const inpamP,
-                                     struct pam *     const outpamP,
-                                     struct xformCore const xformCore);
+pamflip_transformRowsToColumnsPbmSse(const struct pam *     const inpamP,
+                                     const struct pam *     const outpamP,
+                                     struct xformCore       const xformCore);
 
 #endif
author	giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8>	2010-06-27 17:57:35 +0000
committer	giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8>	2010-06-27 17:57:35 +0000
commit	e0d1d6e5f3836c0f5b2bf20a8666b312802a540e (patch)
tree	543bfd8eeba09d2a50b6b6c6ec3fbf6a60becad5 /editor
parent	7c7b2eba2d70845da2ca2c95e3cc0f6d91c0eb2c (diff)
download	netpbm-mirror-e0d1d6e5f3836c0f5b2bf20a8666b312802a540e.tar.gz netpbm-mirror-e0d1d6e5f3836c0f5b2bf20a8666b312802a540e.tar.xz netpbm-mirror-e0d1d6e5f3836c0f5b2bf20a8666b312802a540e.zip