about summary refs log tree commit diff
diff options
context:
space:
mode:
author giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8> 2013-10-30 18:57:38 +0000
committer giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8> 2013-10-30 18:57:38 +0000
commit 9f8b7230b969fde154cf06de3ee36fe14636371b (patch)
tree f1c74fed6644663218ea987c4bd4104100a13ec4
parent 6229105f5ce31e34b08dc656a71a919f313a11d7 (diff)
download netpbm-mirror-9f8b7230b969fde154cf06de3ee36fe14636371b.tar.gz
netpbm-mirror-9f8b7230b969fde154cf06de3ee36fe14636371b.tar.xz
netpbm-mirror-9f8b7230b969fde154cf06de3ee36fe14636371b.zip
Use SSE stuff with Clang as for GCC
git-svn-id: http://svn.code.sf.net/p/netpbm/code/trunk@2027 9d0c8265-081b-0410-96cb-a4ca84ce46f8
-rw-r--r-- buildtools/Makefile 3
-rw-r--r-- common.mk 22
-rw-r--r-- config.mk.in 7
-rw-r--r-- converter/other/cameratopam/Makefile 4
-rw-r--r-- converter/other/pnmtopalm/Makefile 2
-rw-r--r-- doc/HISTORY 5
-rw-r--r-- editor/pamflip/Makefile 6
-rw-r--r-- lib/Makefile 9
-rw-r--r-- lib/libpbm3.c 47
-rw-r--r-- lib/util/Makefile 11
-rw-r--r-- pm_config.in.h 23
-rw-r--r-- urt/Makefile 10
12 files changed, 100 insertions, 49 deletions
diff --git a/buildtools/Makefile b/buildtools/Makefile
index 10ac1a58..6a2e33da 100644
--- a/buildtools/Makefile
+++ b/buildtools/Makefile
@@ -26,7 +26,8 @@ endif
 libopt.o: libopt.c
 	$(CC_FOR_BUILD) -c -o $@ $(CFLAGS_FOR_BUILD) \
 	  -DSHLIBPREFIXLIST="\"$(SHLIBPREFIXLIST)\"" \
-	  $(STRIP_DLL_VERSION) $(EXPLICIT) $(CFLAGS_PERSONAL) $(CADD) \
+	  $(STRIP_DLL_VERSION) $(EXPLICIT) \
+	  $(CFLAGS_PERSONAL) $(CFLAGS) $(CADD) \
 	  $<
 
 typegen.o endiangen.o:%.o:%.c
diff --git a/common.mk b/common.mk
index 4077f2ea..c096bd6e 100644
--- a/common.mk
+++ b/common.mk
@@ -43,13 +43,11 @@
 # MERGEBINARIES: list of the programs that, in a merge build, are invoked
 #   via the merged Netpbm program
 # CC: C compiler command 
-# CPPFLAGS: C preprocessor options
-# CFLAGS: C compiler general options
+# CFLAGS_CONFIG: C compiler options from config.mk.
 # CFLAGS_TARGET: C compiler options for a particular target
 # LD: linker command
 # LINKERISCOMPILER: 'Y' if the linker invoked by LD is actually a compiler
 #   front end, so takes linker options in a different format
-# LDFLAGS: linker options 
 # LIBS or LOADLIBES: names of libraries to be added to all links
 # COMP_INCLUDES: Compiler option string to establish the search path for
 #   component-specific include files when compiling things or computing
@@ -64,6 +62,10 @@
 # is intended to be set on a make command line (e.g. 'make CADD=-g')
 # for options that apply just to a particular build.
 
+# In addition, there is CFLAGS, which is extra C compilation options and is
+# expected to be set via the make command line for a particular build.
+# Likewise, LDFLAGS for link-edit options.
+
 # In addition, there is CFLAGS_PERSONAL, which is extra C
 # compilation options and is expected to be set via environment variable
 # for options that are particular to the person doing the build and not
@@ -235,7 +237,19 @@ config:
 # -UNDEBUG (in any of various ways) to override this.
 #
 CFLAGS_ALL = \
-  -DNDEBUG $(CPPFLAGS) $(CFLAGS) $(CFLAGS_TARGET) $(CFLAGS_PERSONAL) $(CADD)
+  -DNDEBUG $(CPPFLAGS) $(CFLAGS_CONFIG) $(CFLAGS_TARGET) $(CFLAGS_PERSONAL) $(CFLAGS) $(CADD)
+
+ifeq ($(WANT_SSE),Y)
+  # The only two compilers we've seen that have the SSE capabilities that
+  # WANT_SSE requests are GCC and Clang, and they both have these options and
+  # require them in order for <emmintrin.h> to compile.  On some systems
+  # (x86_64, in our experience), these options are default, but on more
+  # traditional systems, they are not.  Note: __SSE2__ macro tells whether
+  # -msse2 is in effect.
+  CFLAGS_SSE = -msse -msse2
+else
+  CFLAGS_SSE =
+endif
 
 $(OBJECTS): %.o: %.c importinc
 #############################################################################
diff --git a/config.mk.in b/config.mk.in
index 307c6480..bc1d2804 100644
--- a/config.mk.in
+++ b/config.mk.in
@@ -93,7 +93,10 @@ HAVE_INT64 = Y
 # standard SSE intrinsics (operators such as '_mm_movemask_epi8').  SSE
 # instructions are faster than traditional instructions, but aren't available
 # on all CPUs.  Also, the standard intrinsics are not available in all
-# compilers.
+# compilers.  Even if you say N here, Netpbm may still be built with some
+# SSE exploitation (e.g. SSE floating point) because the compiler will 
+# do it automatically.  You can add a -mno-sse or -mno-sse2 option to
+# CFLAGS or CFLAGS_PERSONAL to stop that.
 WANT_SSE = N
 #WANT_SSE = Y
 
@@ -104,7 +107,7 @@ WANT_SSE = N
 # to use to compile and link build tools.
 CC_FOR_BUILD = $(CC)
 LD_FOR_BUILD = $(LD)
-CFLAGS_FOR_BUILD = $(CFLAGS)
+CFLAGS_FOR_BUILD = $(CFLAGS_CONFIG)
 LDFLAGS_FOR_BUILD = $(LDFLAGS)
 
 # MAKE is set automatically by Make to what was used to invoke Make.
diff --git a/converter/other/cameratopam/Makefile b/converter/other/cameratopam/Makefile
index 20a95aa2..4470d472 100644
--- a/converter/other/cameratopam/Makefile
+++ b/converter/other/cameratopam/Makefile
@@ -9,7 +9,7 @@ EXTERN_INCLUDES =
 ifneq ($(JPEGLIB),NONE)
   ifneq ($(JPEGHDR_DIR)x,x)
     EXTERN_INCLUDES += -I$(JPEGHDR_DIR)
-    CFLAGS += -DHAVE_JPEG
+    HAVE_JPEG_DEFINE = -DHAVE_JPEG
   endif
 endif
 
@@ -22,6 +22,8 @@ all: cameratopam
 OBJECTS = util.o identify.o cameratopam.o camera.o foveon.o decode.o \
 	canon.o ljpeg.o dng.o
 
+camera.o camera.o2: CFLAGS_TARGET = $(HAVE_JPEG_DEFINE)
+
 MERGE_OBJECTS =
 
 BINARIES = cameratopam
diff --git a/converter/other/pnmtopalm/Makefile b/converter/other/pnmtopalm/Makefile
index 7f99f95a..65790002 100644
--- a/converter/other/pnmtopalm/Makefile
+++ b/converter/other/pnmtopalm/Makefile
@@ -25,7 +25,7 @@ $(BINARIES): %: %.o palmcolormap.o $(NETPBMLIB) $(LIBOPT)
 	  $(MATHLIB) $(LDFLAGS) $(LDLIBS) $(RPATH) $(LADD)
 
 gen_palm_colormap : % : %.c palmcolormap.o
-	$(CC) -I importinc $(CPPFLAGS) $(CFLAGS) -o $@ \
+	$(CC) -I importinc $(CFLAGS_ALL) -o $@ \
 	  $< palmcolormap.o \
 	  $(LIBOPTS) $(MATHLIB) $(LDFLAGS) $(LDLIBS) $(LADD)
 
diff --git a/doc/HISTORY b/doc/HISTORY
index afc78da0..6b298ec9 100644
--- a/doc/HISTORY
+++ b/doc/HISTORY
@@ -15,7 +15,10 @@ not yet  BJH  Release 10.65.00
               Fix compile failure on system such as OpenBSD that don't have
               SIGWINCH and SIGIO.  Broken since 10.49 (December 2009).
 
-              Build: Use <emmintrins.h> interface for SSE intrinsics
+              Build: Use SSE2 vector instructions when compiling with Clang,
+              as done already with GCC.
+
+              Build: Use <emmintrin.h> interface for SSE intrinsics
               instead of GCC-specific versions.  Thanks Prophet of the Way
               <afu@wta.att.ne.jp>.
 
diff --git a/editor/pamflip/Makefile b/editor/pamflip/Makefile
index 497c5379..83e961a7 100644
--- a/editor/pamflip/Makefile
+++ b/editor/pamflip/Makefile
@@ -5,6 +5,8 @@ endif
 SUBDIR = editor/pamflip
 VPATH=.:$(SRCDIR)/$(SUBDIR)
 
+default: all
+
 include $(BUILDDIR)/config.mk
 
 SUBDIRS =
@@ -21,10 +23,12 @@ OBJECTS = $(PAMFLIP_OBJECTS)
 
 MERGE_OBJECTS = $(OBJECTS:%.o=%.o2)
 
+include $(SRCDIR)/common.mk
+
 .PHONY: all
 all: $(BINARIES) $(SUBDIRS:%=%/all)
 
-include $(SRCDIR)/common.mk
+pamflip_sse.o pamflip_sse.o2: CFLAGS_TARGET = $(CFLAGS_SSE)
 
 pamflip: $(PAMFLIP_OBJECTS) $(NETPBMLIB) $(LIBOPT)
 	$(LD) -o $@ $(PAMFLIP_OBJECTS) \
diff --git a/lib/Makefile b/lib/Makefile
index 8d9b3175..6512949f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -84,13 +84,12 @@ extra_staticlib: $(EXTRA_STATICLIB)
 # type, but request a static library in addition.
 #----------------------------------------------------------------------------
 
-# Note that the user may have configured -I options into CPPFLAGS/CFLAGS.
-CFLAGS_ALL = $(INCLUDES) -DNDEBUG $(CPPFLAGS) $(CFLAGS) $(CFLAGS_SHLIB) \
- $(CFLAGS_PERSONAL) $(CADD)
+$(LIBOBJECTS): CFLAGS_TARGET=$(CFLAGS_SHLIB)
+
+libpbm3.o: CFLAGS_TARGET+=$(CFLAGS_SSE)
 
 $(LIBOBJECTS): %.o: %.c importinc
-# We have to get the command all on one line to avoid messy make messages
-	$(CC) -c $(CFLAGS_ALL) -o $@ $<
+	$(CC) -c $(INCLUDES) $(CFLAGS_ALL) -o $@ $<
 
 MAJ = 11
 MIN = $(NETPBM_MINOR_RELEASE)
diff --git a/lib/libpbm3.c b/lib/libpbm3.c
index 3d846962..020e1558 100644
--- a/lib/libpbm3.c
+++ b/lib/libpbm3.c
@@ -16,21 +16,28 @@
 #include "pbm.h"
 
 #ifndef PACKBITS_SSE
-#if HAVE_GCC_SSE2 && HAVE_GCC_BSWAP && defined(__SSE2__)
+#if WANT_SSE && defined(__SSE2__) && HAVE_GCC_BSWAP
   #define PACKBITS_SSE 2
 #else
   #define PACKBITS_SSE 0
 #endif
 #endif
 
-/* HAVE_GCC_SSE2 means we have the means to use SSE CPU facilities
-   to make PBM raster processing faster.  GCC only.
+/* WANT_SSE means we want to use SSE CPU facilities to make PBM raster
+   processing faster.  This implies it's actually possible - i.e. the
+   build environment has <emmintrin.h>.
 
-   The GNU Compiler -msse2 option makes SSE/SSE2 available.
+   The GNU Compiler -msse2 option makes SSE/SSE2 available, and is
+   evidenced by __SSE2__.
    For x86-32 with SSE, "-msse2" must be explicitly given.
    For x86-64 and AMD64, "-msse2" is the default (from Gcc v.4.)
 */
 
+#if PACKBITS_SSE == 2
+  #include <emmintrin.h>
+#endif
+
+
 void
 pbm_writepbminit(FILE * const fileP, 
                  int    const cols, 
@@ -81,16 +88,28 @@ packBitsWithSse2(  FILE *          const fileP,
       PCMPGTB128  Packed CoMPare Greater Than Byte
     
         Compares 16 bytes in parallel
-        Result is x00 if greater than, xFF if not for each byte       
+        Result is xFF if greater than, x00 if not, for each byte
+
     
       PMOVMSKB128 Packed MOVe MaSK Byte 
     
-        Result is a byte of the MSBs of 16 bytes
+        Result is 16 bits, the MSBs of 16 bytes
         x00 xFF x00 xFF xFF xFF x00 x00 xFF xFF xFF xFF x00 x00 x00 x00 
         --> 0101110011110000B = 0x5CF0
         
         The result is actually a 64 bit int, but the higher bits are
         always 0.
+
+      We use SSE instructions in "_mm_" form in preference to "__builtin_".
+      In GCC the "__builtin_" form is documented but "_mm_" is not.
+      Former versions of this source file used "__builtin_".  This was
+      changed to make possible compilation with clang, which does not
+      implement some "__builtin_" forms.
+
+      __builtin_ia32_pcmpgtb128 :  _mm_cmpgt_epi8
+      __builtin_ia32_pmovmskb128 : _mm_movemask_epi8
+
+      The conversion requires <emmintrin.h> .
     */
 
     typedef char v16qi __attribute__ ((vector_size(16)));
@@ -110,11 +129,10 @@ packBitsWithSse2(  FILE *          const fileP,
         bit128.i64[1]=__builtin_bswap64( *(uint64_t*) &bitrow[col+8]);
 
         {
-            v16qi const compare =
-                __builtin_ia32_pcmpgtb128(bit128.v16, zero128);
-            uint16_t const blackMask = 
-                (uint16_t) __builtin_ia32_pmovmskb128(compare);
-
+            v16qi const compare = (v16qi)
+                _mm_cmpgt_epi8((__m128i)bit128.v16, (__m128i) zero128);
+            uint16_t const blackMask = _mm_movemask_epi8 ((__m128i)compare);
+            
             *(uint16_t *) & packedBits[col/8] = blackMask;
         }
     }
@@ -128,10 +146,9 @@ packBitsWithSse2(  FILE *          const fileP,
             bit128.byte[ (i&8) + 7-(i&7) ] = bitrow[j];
       
         {
-            v16qi const compare =
-                __builtin_ia32_pcmpgtb128( bit128.v16, zero128 );
-            uint16_t const blackMask =
-                __builtin_ia32_pmovmskb128( compare );
+            v16qi const compare = (v16qi)
+                _mm_cmpgt_epi8((__m128i)bit128.v16, (__m128i) zero128);
+            uint16_t const blackMask = _mm_movemask_epi8 ((__m128i)compare);
 
             if ( cols%16 >8 )  /* Two partial bytes */
                 *(uint16_t *) & packedBits[col/8] = blackMask;
diff --git a/lib/util/Makefile b/lib/util/Makefile
index 5bf1995e..28dfddfe 100644
--- a/lib/util/Makefile
+++ b/lib/util/Makefile
@@ -5,6 +5,8 @@ endif
 SUBDIR = lib/util
 VPATH=.:$(SRCDIR)/$(SUBDIR)
 
+default:all
+
 include $(BUILDDIR)/config.mk
 
 # nstring is required for asprintf(), etc.  Also some systems don't have
@@ -22,13 +24,14 @@ UTILOBJECTS = \
 
 MERGE_OBJECTS =
 
+include $(SRCDIR)/common.mk
+
 all: $(UTILOBJECTS)
 
-include $(SRCDIR)/common.mk
+$(UTILOBJECTS): CFLAGS_TARGET=$(CFLAGS_SHLIB)
 
 $(UTILOBJECTS):%.o:%.c importinc
-	$(CC) -c $(INCLUDES) -DNDEBUG $(CPPFLAGS) $(CFLAGS) $(CFLAGS_SHLIB) \
-	  $(CFLAGS_PERSONAL) $(CADD) -o $@ $<
+	$(CC) -c $(INCLUDES) $(CFLAGS_ALL) -o $@ $<
 
 testnstring: test.c nstring.h nstring.o
-	$(CC) $(CFLAGS) $(CADD) -o $@ nstring.o $<
+	$(CC) $(CFLAGS_ALL) -o $@ nstring.o $<
diff --git a/pm_config.in.h b/pm_config.in.h
index f79d44c2..fd651e3a 100644
--- a/pm_config.in.h
+++ b/pm_config.in.h
@@ -188,7 +188,7 @@
    when available.  Test whether they exist.
 
    Prevent the build from exploiting these extensions by defining
-   NO_GCC_BUILTINS.
+   NO_GCC_UNIQUE.
 
    Before Netpbm 10.65 (December 2013), Netpbm used GCC compiler extensions
    to generate SSE code in Pamflip.  Starting in 10.65, Netpbm instead uses
@@ -206,7 +206,7 @@
    > cc --version
      Apple clang version 4.0 (tags/Apple/clang-421.0.60) (based on LLVM 3.1svn)
 
-   which masquerades as GCC 4.2.1, but it does not have SSE2 function
+   which masquerades as GCC 4.2.1, but it does not have SSE2 operator
    __builtin_ia32_pcmpeqb128 .
 
   On the other hand, research by Prophet of the Way in September 2012
@@ -214,20 +214,24 @@
   compiled successfully with SSE exploitation), but 3.1 does not.  He did
   not find any mention in documentation of that change.
 
+  At least some versions of Clang that do not have __builtin_ia32_pcmpeqb128
+  nonetheless have other GCC SSE2 operators, such as __builtin_ia32_pcmpgtb128.
+  We did not detect a pattern.
+
   See below on compilers other than GCC that set __GNUC__:
   http://sourceforge.net/apps/mediawiki/predef/index.php?title=Compilers
 */
-#if defined(__GNUC__) && !defined(__clang__) && !defined(NO_GCC_BUILTINS)
+#if defined(__GNUC__) && !defined(__clang__) && !defined(NO_GCC_UNIQUE)
   #define GCCVERSION __GNUC__*100 + __GNUC_MINOR__
 #else
   #define GCCVERSION 0
 #endif
 
-/* HAVE_GCC_SSE2 means the compiler has GCC-specific builtins to directly
-   access SSE/SSE2 features.  This is different from whether the compiler
-   generates code that uses these features at all.  It is also different
-   from whether the compiler has the more standard operators defined in
-   <emmintrins.h>.
+/* HAVE_GCC_SSE2 means the compiler has all of the GCC-specific builtins to
+   directly access SSE/SSE2 features.  This is different from whether the
+   compiler generates code that uses these features at all.  It is also
+   different from whether the compiler has the more standard operators defined
+   in <emmintrin.h>.
 */
 
 #ifndef HAVE_GCC_SSE2
@@ -254,10 +258,9 @@
 #endif
 
 #ifndef HAVE_GCC_BSWAP
-#if GCCVERSION >=403
+#if GCCVERSION >=403 || defined(__clang__)
   #define HAVE_GCC_BSWAP 1
   /* Use __builtin_bswap32(), __builtin_bswap64() for endian conversion.
-     Available from GCC v 4.3 onward.
      NOTE: On intel CPUs this may produce the bswap operand which is not
      available on 80386. */
 #else
diff --git a/urt/Makefile b/urt/Makefile
index b94da1b2..0aef5290 100644
--- a/urt/Makefile
+++ b/urt/Makefile
@@ -5,6 +5,8 @@ endif
 SUBDIR = urt
 VPATH=.:$(SRCDIR)/$(SUBDIR)
 
+default: all
+
 include $(BUILDDIR)/config.mk
 
 LIBOBJECTS = Runput.o cmd_name.o \
@@ -15,6 +17,9 @@ LIBOBJECTS = Runput.o cmd_name.o \
 
 MERGE_OBJECTS =
 
+OMIT_URT_RULE = 1
+include $(SRCDIR)/common.mk
+
 all: librle.a
 
 librle.a: $(LIBOBJECTS)
@@ -24,11 +29,8 @@ librle.a: $(LIBOBJECTS)
 
 # Rule for objects.
 $(LIBOBJECTS): %.o: %.c importinc
-	$(CC) -c $(INCLUDES) -o $@ \
-	  $< $(CPPFLAGS) $(CFLAGS) $(CFLAGS_PERSONAL) $(CADD)
+	$(CC) -c $(INCLUDES) -o $@ $< $(CFLAGS_ALL)
 
 BINARIES =
 SCRIPTS =
 
-OMIT_URT_RULE = 1
-include $(SRCDIR)/common.mk