author     giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8>  2010-05-13 14:17:39 +0000
committer  giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8>  2010-05-13 14:17:39 +0000
commit     5d8e96684997df4783e50599d7de3aef7eaa2103 (patch)
tree       b3b3961df760d26aea48c8b50ab23df9a37b852e
parent     78f628415fdc9d3108de114499a748bac5f73d9e (diff)
Use SSE instead of MMX
git-svn-id: http://svn.code.sf.net/p/netpbm/code/trunk@1211 9d0c8265-081b-0410-96cb-a4ca84ce46f8
-rw-r--r--  doc/HISTORY     |   3
-rw-r--r--  lib/libpbm3.c   | 141
-rw-r--r--  pm_config.in.h  |  26
3 files changed, 102 insertions, 68 deletions
diff --git a/doc/HISTORY b/doc/HISTORY
index f5ebe4f3..ed422a16 100644
--- a/doc/HISTORY
+++ b/doc/HISTORY
@@ -19,6 +19,9 @@ not yet  BJH  Release 10.51.00
               pnmsmooth: Don't display pnmconvol messages (i.e. run
               pnmconvol with -quiet).
 
+              libnetpbm, various PBM programs: Use SSE instead of MMX.  Thanks
+              Prophet of the Way <afu@wta.att.ne.jp>.
+
               pnmtops: fix bug: 12 bits per sample output when 8 would do.
               Introduced in 10.40.
 
diff --git a/lib/libpbm3.c b/lib/libpbm3.c
index 9200d30e..b41f90de 100644
--- a/lib/libpbm3.c
+++ b/lib/libpbm3.c
@@ -15,16 +15,20 @@
 #include "pm_c_util.h"
 #include "pbm.h"
 
-#if HAVE_GCC_MMXSSE
-#include "bitreverse.h"
+#ifndef PACKBITS_SSE
+#if HAVE_GCC_SSE2 && HAVE_GCC_BSWAP && defined(__SSE2__)
+  #define PACKBITS_SSE 2
+#else
+  #define PACKBITS_SSE 0
+#endif
 #endif
 
-/* HAVE_GCC_MMXSSE means we have the means to use MMX and SSE CPU facilities
+/* HAVE_GCC_SSE2 means we have the means to use SSE2 CPU facilities
    to make PBM raster processing faster.  GCC only.
 
-   The GNU Compiler -msse option makes SSE available.
-   For x86-32 with MMX/SSE, "-msse" must be explicitly given.
-   For x86-64 and AMD64, "-msse" is on by default.
+   The GNU Compiler -msse2 option makes SSE/SSE2 available.
+   For x86-32 with SSE2, "-msse2" must be explicitly given.
+   For x86-64 and AMD64, "-msse2" is on by default (since GCC 4).
 */
 
 void
@@ -56,80 +60,97 @@ writePackedRawRow(FILE *                const fileP,
 } 
 
 
-#if HAVE_GCC_MMXSSE
+
+#if PACKBITS_SSE == 2
 static void
-packBitsWithMmxSse(FILE *          const fileP,
+packBitsWithSse2(  FILE *          const fileP,
                    const bit *     const bitrow,
                    unsigned char * const packedBits,
-                   unsigned int    const cols,
-                   unsigned int *  const nextColP) {
+                   unsigned int    const cols) {
 /*----------------------------------------------------------------------------
-   Pack the bits of bitrow[] into bytes at 'packedBits'.  Going left to right,
-   stop when there aren't enough bits left to fill a whole byte.  Return
-   as *nextColP the number of the next column after the rightmost one we
-   packed.
+    Pack the bits of bitrow[] into bytes at 'packedBits'.
+
+    Use the SSE2 facilities to pack the bits quickly, but
+    perform the exact same function as the simpler
+    packBitsGeneric() + packPartialBytes().
 
-   Use the Pentium MMX and SSE facilities to pack the bits quickly, but
-   perform the exact same function as the simpler packBitsGeneric().
+    Unlike packBitsGeneric(), the whole row is converted.
 -----------------------------------------------------------------------------*/
     /*
-      We use MMX/SSE facilities that operate on 8 bytes at once to pack
-      the bits quickly.
-    
-      We use 2 MMX registers (no SSE registers).
+      We use 2 SSE registers.
     
       The key machine instructions are:
+        
+      PCMPGTB128  Packed CoMPare Greater Than Byte
     
-    
-      PCMPGTB  Packed CoMPare Greater Than Byte
-    
-        Compares 8 bytes in parallel
+        Compares 16 bytes in parallel
         Result is xFF if greater than, x00 if not, for each byte
     
-      PMOVMSKB Packed MOVe MaSK Byte 
+      PMOVMSKB128 Packed MOVe MaSK Byte 
     
-        Result is a byte of the MSBs of 8 bytes
-        x00 xFF x00 xFF xFF xFF x00 x00 --> 01011100B = 0x5C
+        Result is a byte of the MSBs of 16 bytes
+        x00 xFF x00 xFF xFF xFF x00 x00 xFF xFF xFF xFF x00 x00 x00 x00 
+        --> 0101110011110000B = 0x5CF0
         
-        The result is actually a 32 bit int, but the higher bits are
-        always 0.  (0x0000005C in the above case)
-    
-      EMMS     Empty MMx State
-    
-        Free MMX registers  
-    
+        The result is actually a 64 bit int, but the higher bits are
+        always 0.
     */
 
-
-    typedef char v8qi __attribute__ ((vector_size(8)));
-    typedef int di __attribute__ ((mode(DI)));
+    typedef char v16qi __attribute__ ((vector_size(16)));
 
     unsigned int col;
-    v8qi const zero64 =(v8qi)((di)0);  /* clear to zero */
-
-    for (col = 0; col + 7 < cols; col += 8) {
-
-        v8qi const compare =
-            __builtin_ia32_pcmpgtb(*(v8qi*) (&bitrow[col]), (v8qi) zero64);
-        uint32_t const backwardBlackMask =  __builtin_ia32_pmovmskb(compare);
-        unsigned char const blackMask = bitreverse[backwardBlackMask];
-
-        packedBits[col/8] = blackMask;
+    union {
+        v16qi    v16;
+        uint64_t i64[2];
+        unsigned char byte[16];
+    } bit128;
+
+    v16qi zero128;
+    zero128 = zero128 ^ zero128;   /* clear to zero */
+
+    for (col = 0; col + 15 < cols; col += 16) {
+        bit128.i64[0]=__builtin_bswap64( *(uint64_t*) &bitrow[col]);
+        bit128.i64[1]=__builtin_bswap64( *(uint64_t*) &bitrow[col+8]);
+
+        {
+            v16qi const compare =
+                __builtin_ia32_pcmpgtb128(bit128.v16, zero128);
+            uint16_t const blackMask = 
+                (uint16_t) __builtin_ia32_pmovmskb128(compare);
+
+            *(uint16_t *) & packedBits[col/8] = blackMask;
+        }
     }
-    *nextColP = col;
 
-    __builtin_ia32_emms();
+    if (cols % 16 > 0) {
+        unsigned int i, j;
 
+        bit128.v16 = bit128.v16 ^ bit128.v16;
+    
+        for (i = 0, j = col ; j < cols; ++i, ++j) 
+            bit128.byte[ (i&8) + 7-(i&7) ] = bitrow[j];
+      
+        {
+            v16qi const compare =
+                __builtin_ia32_pcmpgtb128( bit128.v16, zero128 );
+            uint16_t const blackMask =
+                __builtin_ia32_pmovmskb128( compare );
+
+            if ( cols%16 >8 )  /* Two partial bytes */
+                *(uint16_t *) & packedBits[col/8] = blackMask;
+            else              /* One partial byte */
+                packedBits[col/8] = (unsigned char) blackMask ;
+        }
+    }
 }
 #else
 /* Avoid undefined function warning; never actually called */
 
-#define packBitsWithMmxSse(a,b,c,d,e) packBitsGeneric(a,b,c,d,e)
+#define packBitsWithSse2(a,b,c,d) packBitsGeneric((a),(b),(c),(d),NULL)
 #endif
 
 
 
-
 static unsigned int
 bitValue(unsigned char const byteValue) {
 
@@ -212,18 +233,20 @@ writePbmRowRaw(FILE *      const fileP,
         pm_setjmpbuf(origJmpbufP);
         pm_longjmp();
     } else {
-        unsigned int nextCol;
 
         pm_setjmpbufsave(&jmpbuf, &origJmpbufP);
 
-        if (HAVE_GCC_MMXSSE)
-            packBitsWithMmxSse(fileP, bitrow, packedBits, cols, &nextCol);
-        else 
+        switch (PACKBITS_SSE) {
+        case 2: 
+            packBitsWithSse2(fileP, bitrow, packedBits, cols);
+            break;
+        default: {
+            unsigned int nextCol;
             packBitsGeneric(fileP, bitrow, packedBits, cols, &nextCol);
-
-        if (cols % 8 > 0)
-            packPartialBytes(bitrow, cols, nextCol, packedBits);
-        
+            if (cols % 8 > 0)
+                packPartialBytes(bitrow, cols, nextCol, packedBits);
+        }
+        }
         writePackedRawRow(fileP, packedBits, cols);
 
         pm_setjmpbuf(origJmpbufP);
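
The SSE2 technique described in the packBitsWithSse2() comments can be exercised outside the library.  The following is a minimal standalone sketch, not part of the patch: the 16-pixel test row and the main() wrapper are invented for illustration, and it copies the row through memcpy() rather than the pointer cast the library code uses.  It needs GCC with "-msse2" (or a default x86-64 build) and packs 16 one-byte pixels into two PBM raster bytes with the same bswap64 / pcmpgtb128 / pmovmskb128 sequence.

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    typedef char v16qi __attribute__ ((vector_size(16)));

    int
    main(void) {

        /* 16 PBM pixels, one byte each: 1 = black, 0 = white */
        unsigned char const bitrow[16] =
            {0,1,0,1, 1,1,0,0, 1,1,1,1, 0,0,0,0};

        union {
            v16qi         v16;
            uint64_t      i64[2];
            unsigned char byte[16];
        } bit128;

        v16qi const zero128 = { 0 };   /* sixteen zero bytes */
        uint64_t lowHalf, highHalf;

        /* Byte-swap each 8-byte half so the leftmost pixel lands in the
           most significant bit of its packed byte, as PBM requires.
        */
        memcpy(&lowHalf,  &bitrow[0], 8);
        memcpy(&highHalf, &bitrow[8], 8);
        bit128.i64[0] = __builtin_bswap64(lowHalf);
        bit128.i64[1] = __builtin_bswap64(highHalf);

        {
            /* Bytes holding 1 become 0xFF; bytes holding 0 stay 0x00 */
            v16qi const compare =
                __builtin_ia32_pcmpgtb128(bit128.v16, zero128);
            /* Collect the MSB of each of the 16 bytes into one 16-bit mask */
            uint16_t const blackMask =
                (uint16_t) __builtin_ia32_pmovmskb128(compare);

            printf("packed bytes: 0x%02X 0x%02X\n",
                   (unsigned int)(blackMask & 0xFF),
                   (unsigned int)(blackMask >> 8));
        }
        return 0;
    }

It prints "packed bytes: 0x5C 0xF0", matching the 0x5CF0 example in the comment above; because pmovmskb128 puts the first packed byte in the low half of the mask, the library code can store the whole 16-bit mask at &packedBits[col/8] on little-endian x86.
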
diff --git a/pm_config.in.h b/pm_config.in.h
index c44bc72c..c2f75454 100644
--- a/pm_config.in.h
+++ b/pm_config.in.h
@@ -198,14 +198,14 @@ extern int rand();
   #endif
 #endif
 
-/* CONFIGURE: GNUC extensions are used in performance critical places
+/* CONFIGURE: GNU Compiler extensions are used in performance critical places
    when available.  Test whether they exist.
 
    Turn off by defining NO_GCC_BUILTINS.
 
-   Note that though these influence the code produced, the compiler
-   setting ultimately decides what operands are used.  If you
-   want a generic build, check the manual and adjust CFLAGS in
+   Note that though these influence the resulting Netpbm machine code, the
+   compiler settings ultimately decide what instruction set the generated code
+   uses.  If you want a generic build, check the manual and adjust CFLAGS in
    config.mk accordingly.
 
    For example, if you want binaries that run on all Intel x86-32
@@ -213,6 +213,10 @@ extern int rand();
    config.mk is much better than setting NO_GCC_BUILTINS to 1.
    If you want to be extra sure use:
    "-march=i386 -mno-mmx -mno-sse -DNO_GCC_BUILTINS"
+
+   GCC uses SSE and SSE2 instructions by default for AMD/Intel x86-64.
+   Tinkering with "-mno-sse" is not recommended for these machines.  If you
+   don't want SSE code, set NO_GCC_BUILTINS to 1.
 */
 
 #if defined(__GNUC__) && !defined(NO_GCC_BUILTINS)
@@ -221,12 +225,16 @@ extern int rand();
   #define GCCVERSION 0
 #endif
 
-#ifndef HAVE_GCC_MMXSSE
-#if GCCVERSION >=301 && defined(__MMX__) && defined(__SSE__)
-  #define HAVE_GCC_MMXSSE 1
-  /* Use GCC builtins to directly access MMX/SSE features */ 
+/* HAVE_GCC_SSE2 means the compiler has GCC builtins to directly access
+   SSE/SSE2 features.  This is different from whether the compiler generates
+   code that uses these features at all.
+*/
+
+#ifndef HAVE_GCC_SSE2
+#if GCCVERSION >=401 && defined(__SSE__) && defined(__SSE2__)
+  #define HAVE_GCC_SSE2 1
 #else
-  #define HAVE_GCC_MMXSSE 0
+  #define HAVE_GCC_SSE2 0
 #endif
 #endif
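
Whether HAVE_GCC_SSE2 (and with it PACKBITS_SSE in lib/libpbm3.c) ends up nonzero depends only on compiler predefines, so a throwaway probe shows what a given compiler invocation provides.  The sketch below is not part of Netpbm and assumes GCCVERSION follows the usual __GNUC__ * 100 + __GNUC_MINOR__ scheme implied by the ">=401" comparison above.  On x86-32, building it with and without "-msse2" shows __SSE2__ appear and disappear; a default x86-64 build defines it unconditionally.

    #include <stdio.h>

    int
    main(void) {
    #if defined(__GNUC__) && !defined(NO_GCC_BUILTINS)
        /* Same version scheme the HAVE_GCC_SSE2 test compares against 401 */
        int const gccVersion = __GNUC__ * 100 + __GNUC_MINOR__;
    #else
        int const gccVersion = 0;
    #endif

        printf("GCCVERSION = %d\n", gccVersion);

    #if defined(__SSE__)
        printf("__SSE__  is defined\n");
    #else
        printf("__SSE__  is not defined\n");
    #endif

    #if defined(__SSE2__)
        printf("__SSE2__ is defined; HAVE_GCC_SSE2 would be %d\n",
               gccVersion >= 401);
    #else
        printf("__SSE2__ is not defined; HAVE_GCC_SSE2 would be 0\n");
    #endif

        return 0;
    }

As the CONFIGURE comment notes, HAVE_GCC_SSE2 only says the SSE2 builtins are usable; whether ordinary compiled code uses SSE2 instructions is still governed by CFLAGS such as "-msse2" or "-mno-sse" in config.mk.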