From 5d8e96684997df4783e50599d7de3aef7eaa2103 Mon Sep 17 00:00:00 2001
From: giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8>
Date: Thu, 13 May 2010 14:17:39 +0000
Subject: Use SSE instead of MMX

git-svn-id: http://svn.code.sf.net/p/netpbm/code/trunk@1211 9d0c8265-081b-0410-96cb-a4ca84ce46f8
---
 lib/libpbm3.c | 141 ++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 82 insertions(+), 59 deletions(-)

(limited to 'lib/libpbm3.c')

diff --git a/lib/libpbm3.c b/lib/libpbm3.c
index 9200d30e..b41f90de 100644
--- a/lib/libpbm3.c
+++ b/lib/libpbm3.c
@@ -15,16 +15,20 @@
 #include "pm_c_util.h"
 #include "pbm.h"
 
-#if HAVE_GCC_MMXSSE
-#include "bitreverse.h"
+#ifndef PACKBITS_SSE
+#if HAVE_GCC_SSE2 && HAVE_GCC_BSWAP && defined(__SSE2__)
+  #define PACKBITS_SSE 2
+#else
+  #define PACKBITS_SSE 0
+#endif
 #endif
 
-/* HAVE_GCC_MMXSSE means we have the means to use MMX and SSE CPU facilities
+/* HAVE_GCC_SSE2 means we have the means to use SSE CPU facilities
    to make PBM raster processing faster.  GCC only.
 
-   The GNU Compiler -msse option makes SSE available.
-   For x86-32 with MMX/SSE, "-msse" must be explicitly given.
-   For x86-64 and AMD64, "-msse" is on by default.
+   The GNU Compiler -msse2 option makes SSE/SSE2 available.
+   For x86-32 with MMX/SSE, "-msse2" must be explicitly given.
+   For x86-64 and AMD64, "-msse2" is the default (from Gcc v.4.)
 */
 
 void
@@ -56,80 +60,97 @@ writePackedRawRow(FILE *                const fileP,
 } 
 
 
-#if HAVE_GCC_MMXSSE
+
+#if PACKBITS_SSE == 2
 static void
-packBitsWithMmxSse(FILE *          const fileP,
+packBitsWithSse2(  FILE *          const fileP,
                    const bit *     const bitrow,
                    unsigned char * const packedBits,
-                   unsigned int    const cols,
-                   unsigned int *  const nextColP) {
+                   unsigned int    const cols) {
 /*----------------------------------------------------------------------------
-   Pack the bits of bitrow[] into bytes at 'packedBits'.  Going left to right,
-   stop when there aren't enough bits left to fill a whole byte.  Return
-   as *nextColP the number of the next column after the rightmost one we
-   packed.
+    Pack the bits of bitrow[] into bytes at 'packedBits'.
+
+    Use the SSE2 facilities to pack the bits quickly, but
+    perform the exact same function as the simpler
+    packBitsGeneric() + packPartialBytes()
 
-   Use the Pentium MMX and SSE facilities to pack the bits quickly, but
-   perform the exact same function as the simpler packBitsGeneric().
+    Unlike packBitsGeneric(), the whole row is converted.
 -----------------------------------------------------------------------------*/
     /*
-      We use MMX/SSE facilities that operate on 8 bytes at once to pack
-      the bits quickly.
-    
-      We use 2 MMX registers (no SSE registers).
+      We use 2 SSE registers.
     
       The key machine instructions are:
+        
+      PCMPGTB128  Packed CoMPare Greater Than Byte
     
-    
-      PCMPGTB  Packed CoMPare Greater Than Byte
-    
-        Compares 8 bytes in parallel
+        Compares 16 bytes in parallel
         Result is x00 if greater than, xFF if not for each byte       
     
-      PMOVMSKB Packed MOVe MaSK Byte 
+      PMOVMSKB128 Packed MOVe MaSK Byte 
     
-        Result is a byte of the MSBs of 8 bytes
-        x00 xFF x00 xFF xFF xFF x00 x00 --> 01011100B = 0x5C
+        Result is a byte of the MSBs of 16 bytes
+        x00 xFF x00 xFF xFF xFF x00 x00 xFF xFF xFF xFF x00 x00 x00 x00 
+        --> 0101110011110000B = 0x5CF0
         
-        The result is actually a 32 bit int, but the higher bits are
-        always 0.  (0x0000005C in the above case)
-    
-      EMMS     Empty MMx State
-    
-        Free MMX registers  
-    
+        The result is actually a 64 bit int, but the higher bits are
+        always 0.
     */
 
-
-    typedef char v8qi __attribute__ ((vector_size(8)));
-    typedef int di __attribute__ ((mode(DI)));
+    typedef char v16qi __attribute__ ((vector_size(16)));
 
     unsigned int col;
-    v8qi const zero64 =(v8qi)((di)0);  /* clear to zero */
-
-    for (col = 0; col + 7 < cols; col += 8) {
-
-        v8qi const compare =
-            __builtin_ia32_pcmpgtb(*(v8qi*) (&bitrow[col]), (v8qi) zero64);
-        uint32_t const backwardBlackMask =  __builtin_ia32_pmovmskb(compare);
-        unsigned char const blackMask = bitreverse[backwardBlackMask];
-
-        packedBits[col/8] = blackMask;
+    union {
+        v16qi    v16;
+        uint64_t i64[2];
+        unsigned char byte[16];
+    } bit128;
+
+    v16qi zero128;
+    zero128 = zero128 ^ zero128;   /* clear to zero */
+
+    for (col = 0; col + 15 < cols; col += 16) {
+        bit128.i64[0]=__builtin_bswap64( *(uint64_t*) &bitrow[col]);
+        bit128.i64[1]=__builtin_bswap64( *(uint64_t*) &bitrow[col+8]);
+
+        {
+            v16qi const compare =
+                __builtin_ia32_pcmpgtb128(bit128.v16, zero128);
+            uint16_t const blackMask = 
+                (uint16_t) __builtin_ia32_pmovmskb128(compare);
+
+            *(uint16_t *) & packedBits[col/8] = blackMask;
+        }
     }
-    *nextColP = col;
 
-    __builtin_ia32_emms();
+    if (cols % 16 > 0) {
+        unsigned int i, j;
 
+        bit128.v16 = bit128.v16 ^ bit128.v16;
+    
+        for (i = 0, j = col ; j < cols; ++i, ++j) 
+            bit128.byte[ (i&8) + 7-(i&7) ] = bitrow[j];
+      
+        {
+            v16qi const compare =
+                __builtin_ia32_pcmpgtb128( bit128.v16, zero128 );
+            uint16_t const blackMask =
+                __builtin_ia32_pmovmskb128( compare );
+
+            if ( cols%16 >8 )  /* Two partial bytes */
+                *(uint16_t *) & packedBits[col/8] = blackMask;
+            else              /* One partial byte */
+                packedBits[col/8] = (unsigned char) blackMask ;
+        }
+    }
 }
 #else
 /* Avoid undefined function warning; never actually called */
 
-#define packBitsWithMmxSse(a,b,c,d,e) packBitsGeneric(a,b,c,d,e)
+#define packBitsWithSse2(a,b,c,d) packBitsGeneric((a),(b),(c),(d),NULL)
 #endif
 
 
 
-
 static unsigned int
 bitValue(unsigned char const byteValue) {
 
@@ -212,18 +233,20 @@ writePbmRowRaw(FILE *      const fileP,
         pm_setjmpbuf(origJmpbufP);
         pm_longjmp();
     } else {
-        unsigned int nextCol;
 
         pm_setjmpbufsave(&jmpbuf, &origJmpbufP);
 
-        if (HAVE_GCC_MMXSSE)
-            packBitsWithMmxSse(fileP, bitrow, packedBits, cols, &nextCol);
-        else 
+        switch (PACKBITS_SSE) {
+        case 2: 
+            packBitsWithSse2(fileP, bitrow, packedBits, cols);
+            break;
+        default: {
+            unsigned int nextCol;
             packBitsGeneric(fileP, bitrow, packedBits, cols, &nextCol);
-
-        if (cols % 8 > 0)
-            packPartialBytes(bitrow, cols, nextCol, packedBits);
-        
+            if (cols % 8 > 0)
+                packPartialBytes(bitrow, cols, nextCol, packedBits);
+        }
+        }
         writePackedRawRow(fileP, packedBits, cols);
 
         pm_setjmpbuf(origJmpbufP);
-- 
cgit 1.4.1