From 9f8b7230b969fde154cf06de3ee36fe14636371b Mon Sep 17 00:00:00 2001 From: giraffedata Date: Wed, 30 Oct 2013 18:57:38 +0000 Subject: Use SSE stuff with Clang as for GCC git-svn-id: http://svn.code.sf.net/p/netpbm/code/trunk@2027 9d0c8265-081b-0410-96cb-a4ca84ce46f8 --- buildtools/Makefile | 3 ++- common.mk | 22 ++++++++++++++--- config.mk.in | 7 ++++-- converter/other/cameratopam/Makefile | 4 ++- converter/other/pnmtopalm/Makefile | 2 +- doc/HISTORY | 5 +++- editor/pamflip/Makefile | 6 ++++- lib/Makefile | 9 +++---- lib/libpbm3.c | 47 ++++++++++++++++++++++++------------ lib/util/Makefile | 11 ++++++--- pm_config.in.h | 23 ++++++++++-------- urt/Makefile | 10 +++++--- 12 files changed, 100 insertions(+), 49 deletions(-) diff --git a/buildtools/Makefile b/buildtools/Makefile index 10ac1a58..6a2e33da 100644 --- a/buildtools/Makefile +++ b/buildtools/Makefile @@ -26,7 +26,8 @@ endif libopt.o: libopt.c $(CC_FOR_BUILD) -c -o $@ $(CFLAGS_FOR_BUILD) \ -DSHLIBPREFIXLIST="\"$(SHLIBPREFIXLIST)\"" \ - $(STRIP_DLL_VERSION) $(EXPLICIT) $(CFLAGS_PERSONAL) $(CADD) \ + $(STRIP_DLL_VERSION) $(EXPLICIT) \ + $(CFLAGS_PERSONAL) $(CFLAGS) $(CADD) \ $< typegen.o endiangen.o:%.o:%.c diff --git a/common.mk b/common.mk index 4077f2ea..c096bd6e 100644 --- a/common.mk +++ b/common.mk @@ -43,13 +43,11 @@ # MERGEBINARIES: list of the programs that, in a merge build, are invoked # via the merged Netpbm program # CC: C compiler command -# CPPFLAGS: C preprocessor options -# CFLAGS: C compiler general options +# CFLAGS_CONFIG: C compiler options from config.mk. 
# CFLAGS_TARGET: C compiler options for a particular target # LD: linker command # LINKERISCOMPILER: 'Y' if the linker invoked by LD is actually a compiler # front end, so takes linker options in a different format -# LDFLAGS: linker options # LIBS or LOADLIBES: names of libraries to be added to all links # COMP_INCLUDES: Compiler option string to establish the search path for # component-specific include files when compiling things or computing @@ -64,6 +62,10 @@ # is intended to be set on a make command line (e.g. 'make CADD=-g') # for options that apply just to a particular build. +# In addition, there is CFLAGS, which is extra C compilation options and is +# expected to be set via the make command line for a particular build. +# Likewise, LDFLAGS for link-edit options. + # In addition, there is CFLAGS_PERSONAL, which is extra C # compilation options and is expected to be set via environment variable # for options that are particular to the person doing the build and not @@ -235,7 +237,19 @@ config: # -UNDEBUG (in any of various ways) to override this. # CFLAGS_ALL = \ - -DNDEBUG $(CPPFLAGS) $(CFLAGS) $(CFLAGS_TARGET) $(CFLAGS_PERSONAL) $(CADD) + -DNDEBUG $(CPPFLAGS) $(CFLAGS_CONFIG) $(CFLAGS_TARGET) $(CFLAGS_PERSONAL) $(CFLAGS) $(CADD) + +ifeq ($(WANT_SSE),Y) + # The only two compilers we've seen that have the SSE capabilities that + # WANT_SSE requests are GCC and Clang, and they both have these options and + # require them in order for <emmintrin.h> to compile. On some systems + # (x86_64, in our experience), these options are default, but on more + # traditional systems, they are not. Note: __SSE2__ macro tells whether + # -msse2 is in effect. 
+ CFLAGS_SSE = -msse -msse2 +else + CFLAGS_SSE = +endif $(OBJECTS): %.o: %.c importinc ############################################################################# diff --git a/config.mk.in b/config.mk.in index 307c6480..bc1d2804 100644 --- a/config.mk.in +++ b/config.mk.in @@ -93,7 +93,10 @@ HAVE_INT64 = Y # standard SSE intrinsics (operators such as '_mm_movemask_epi8'). SSE # instructions are faster than traditional instructions, but aren't available # on all CPUs. Also, the standard intrinsics are not available in all -# compilers. +# compilers. Even if you say N here, Netpbm may still be built with some +# SSE exploitation (e.g. SSE floating point) because the compiler will +# do it automatically. You can add a -mno-sse or -mno-sse2 option to +# CFLAGS or CFLAGS_PERSONAL to stop that. WANT_SSE = N #WANT_SSE = Y @@ -104,7 +107,7 @@ WANT_SSE = N # to use to compile and link build tools. CC_FOR_BUILD = $(CC) LD_FOR_BUILD = $(LD) -CFLAGS_FOR_BUILD = $(CFLAGS) +CFLAGS_FOR_BUILD = $(CFLAGS_CONFIG) LDFLAGS_FOR_BUILD = $(LDFLAGS) # MAKE is set automatically by Make to what was used to invoke Make. 
diff --git a/converter/other/cameratopam/Makefile b/converter/other/cameratopam/Makefile index 20a95aa2..4470d472 100644 --- a/converter/other/cameratopam/Makefile +++ b/converter/other/cameratopam/Makefile @@ -9,7 +9,7 @@ EXTERN_INCLUDES = ifneq ($(JPEGLIB),NONE) ifneq ($(JPEGHDR_DIR)x,x) EXTERN_INCLUDES += -I$(JPEGHDR_DIR) - CFLAGS += -DHAVE_JPEG + HAVE_JPEG_DEFINE = -DHAVE_JPEG endif endif @@ -22,6 +22,8 @@ all: cameratopam OBJECTS = util.o identify.o cameratopam.o camera.o foveon.o decode.o \ canon.o ljpeg.o dng.o +camera.o camera.o2: CFLAGS_TARGET = $(HAVE_JPEG_DEFINE) + MERGE_OBJECTS = BINARIES = cameratopam diff --git a/converter/other/pnmtopalm/Makefile b/converter/other/pnmtopalm/Makefile index 7f99f95a..65790002 100644 --- a/converter/other/pnmtopalm/Makefile +++ b/converter/other/pnmtopalm/Makefile @@ -25,7 +25,7 @@ $(BINARIES): %: %.o palmcolormap.o $(NETPBMLIB) $(LIBOPT) $(MATHLIB) $(LDFLAGS) $(LDLIBS) $(RPATH) $(LADD) gen_palm_colormap : % : %.c palmcolormap.o - $(CC) -I importinc $(CPPFLAGS) $(CFLAGS) -o $@ \ + $(CC) -I importinc $(CFLAGS_ALL) -o $@ \ $< palmcolormap.o \ $(LIBOPTS) $(MATHLIB) $(LDFLAGS) $(LDLIBS) $(LADD) diff --git a/doc/HISTORY b/doc/HISTORY index afc78da0..6b298ec9 100644 --- a/doc/HISTORY +++ b/doc/HISTORY @@ -15,7 +15,10 @@ not yet BJH Release 10.65.00 Fix compile failure on system such as OpenBSD that don't have SIGWINCH and SIGIO. Broken since 10.49 (December 2009). - Build: Use <emmintrin.h> interface for SSE intrinsics + Build: Use SSE2 vector instructions when compiling with Clang, + as done already with GCC. + + Build: Use <emmintrin.h> interface for SSE intrinsics instead of GCC-specific versions. Thanks Prophet of the Way . 
diff --git a/editor/pamflip/Makefile b/editor/pamflip/Makefile index 497c5379..83e961a7 100644 --- a/editor/pamflip/Makefile +++ b/editor/pamflip/Makefile @@ -5,6 +5,8 @@ endif SUBDIR = editor/pamflip VPATH=.:$(SRCDIR)/$(SUBDIR) +default: all + include $(BUILDDIR)/config.mk SUBDIRS = @@ -21,10 +23,12 @@ OBJECTS = $(PAMFLIP_OBJECTS) MERGE_OBJECTS = $(OBJECTS:%.o=%.o2) +include $(SRCDIR)/common.mk + .PHONY: all all: $(BINARIES) $(SUBDIRS:%=%/all) -include $(SRCDIR)/common.mk +pamflip_sse.o pamflip_sse.o2: CFLAGS_TARGET = $(CFLAGS_SSE) pamflip: $(PAMFLIP_OBJECTS) $(NETPBMLIB) $(LIBOPT) $(LD) -o $@ $(PAMFLIP_OBJECTS) \ diff --git a/lib/Makefile b/lib/Makefile index 8d9b3175..6512949f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -84,13 +84,12 @@ extra_staticlib: $(EXTRA_STATICLIB) # type, but request a static library in addition. #---------------------------------------------------------------------------- -# Note that the user may have configured -I options into CPPFLAGS/CFLAGS. -CFLAGS_ALL = $(INCLUDES) -DNDEBUG $(CPPFLAGS) $(CFLAGS) $(CFLAGS_SHLIB) \ - $(CFLAGS_PERSONAL) $(CADD) +$(LIBOBJECTS): CFLAGS_TARGET=$(CFLAGS_SHLIB) + +libpbm3.o: CFLAGS_TARGET+=$(CFLAGS_SSE) $(LIBOBJECTS): %.o: %.c importinc -# We have to get the command all on one line to avoid messy make messages - $(CC) -c $(CFLAGS_ALL) -o $@ $< + $(CC) -c $(INCLUDES) $(CFLAGS_ALL) -o $@ $< MAJ = 11 MIN = $(NETPBM_MINOR_RELEASE) diff --git a/lib/libpbm3.c b/lib/libpbm3.c index 3d846962..020e1558 100644 --- a/lib/libpbm3.c +++ b/lib/libpbm3.c @@ -16,21 +16,28 @@ #include "pbm.h" #ifndef PACKBITS_SSE -#if HAVE_GCC_SSE2 && HAVE_GCC_BSWAP && defined(__SSE2__) +#if WANT_SSE && defined(__SSE2__) && HAVE_GCC_BSWAP #define PACKBITS_SSE 2 #else #define PACKBITS_SSE 0 #endif #endif -/* HAVE_GCC_SSE2 means we have the means to use SSE CPU facilities - to make PBM raster processing faster. GCC only. +/* WANT_SSE means we want to use SSE CPU facilities to make PBM raster + processing faster. 
This implies it's actually possible - i.e. the + build environment has <emmintrin.h>. - The GNU Compiler -msse2 option makes SSE/SSE2 available. + The GNU Compiler -msse2 option makes SSE/SSE2 available, and is + evidenced by __SSE2__. For x86-32 with SSE, "-msse2" must be explicitly given. For x86-64 and AMD64, "-msse2" is the default (from Gcc v.4.) */ +#if PACKBITS_SSE == 2 + #include <emmintrin.h> +#endif + + void pbm_writepbminit(FILE * const fileP, int const cols, @@ -81,16 +88,28 @@ packBitsWithSse2( FILE * const fileP, PCMPGTB128 Packed CoMPare Greater Than Byte Compares 16 bytes in parallel - Result is x00 if greater than, xFF if not for each byte + Result is xFF if greater than, x00 if not for each byte + PMOVMSKB128 Packed MOVe MaSK Byte - Result is a byte of the MSBs of 16 bytes + Result is 16 bits, the MSBs of 16 bytes x00 xFF x00 xFF xFF xFF x00 x00 xFF xFF xFF xFF x00 x00 x00 x00 --> 0101110011110000B = 0x5CF0 The result is actually a 64 bit int, but the higher bits are always 0. + + We use SSE instructions in "_mm_" form in favor of "__builtin_". + In GCC the "__builtin_" form is documented but "_mm_" is not. + Former versions of this source file used "__builtin_". This was + changed to make possible compilation with clang, which does not + implement some "__builtin_" forms. + + __builtin_ia32_pcmpgtb128 : _mm_cmpgt_epi8 + __builtin_ia32_pmovmskb128 : _mm_movemask_epi8 + + The conversion requires <emmintrin.h>. 
*/ typedef char v16qi __attribute__ ((vector_size(16))); @@ -110,11 +129,10 @@ packBitsWithSse2( FILE * const fileP, bit128.i64[1]=__builtin_bswap64( *(uint64_t*) &bitrow[col+8]); { - v16qi const compare = - __builtin_ia32_pcmpgtb128(bit128.v16, zero128); - uint16_t const blackMask = - (uint16_t) __builtin_ia32_pmovmskb128(compare); - + v16qi const compare = (v16qi) + _mm_cmpgt_epi8((__m128i)bit128.v16, (__m128i) zero128); + uint16_t const blackMask = _mm_movemask_epi8 ((__m128i)compare); + *(uint16_t *) & packedBits[col/8] = blackMask; } } @@ -128,10 +146,9 @@ packBitsWithSse2( FILE * const fileP, bit128.byte[ (i&8) + 7-(i&7) ] = bitrow[j]; { - v16qi const compare = - __builtin_ia32_pcmpgtb128( bit128.v16, zero128 ); - uint16_t const blackMask = - __builtin_ia32_pmovmskb128( compare ); + v16qi const compare = (v16qi) + _mm_cmpgt_epi8((__m128i)bit128.v16, (__m128i) zero128); + uint16_t const blackMask = _mm_movemask_epi8 ((__m128i)compare); if ( cols%16 >8 ) /* Two partial bytes */ *(uint16_t *) & packedBits[col/8] = blackMask; diff --git a/lib/util/Makefile b/lib/util/Makefile index 5bf1995e..28dfddfe 100644 --- a/lib/util/Makefile +++ b/lib/util/Makefile @@ -5,6 +5,8 @@ endif SUBDIR = lib/util VPATH=.:$(SRCDIR)/$(SUBDIR) +default:all + include $(BUILDDIR)/config.mk # nstring is required for asprintf(), etc. 
Also some systems don't have @@ -22,13 +24,14 @@ UTILOBJECTS = \ MERGE_OBJECTS = +include $(SRCDIR)/common.mk + all: $(UTILOBJECTS) -include $(SRCDIR)/common.mk +$(UTILOBJECTS): CFLAGS_TARGET=$(CFLAGS_SHLIB) $(UTILOBJECTS):%.o:%.c importinc - $(CC) -c $(INCLUDES) -DNDEBUG $(CPPFLAGS) $(CFLAGS) $(CFLAGS_SHLIB) \ - $(CFLAGS_PERSONAL) $(CADD) -o $@ $< + $(CC) -c $(INCLUDES) $(CFLAGS_ALL) -o $@ $< testnstring: test.c nstring.h nstring.o - $(CC) $(CFLAGS) $(CADD) -o $@ nstring.o $< + $(CC) $(CFLAGS_ALL) -o $@ nstring.o $< diff --git a/pm_config.in.h b/pm_config.in.h index f79d44c2..fd651e3a 100644 --- a/pm_config.in.h +++ b/pm_config.in.h @@ -188,7 +188,7 @@ when available. Test whether they exist. Prevent the build from exploiting these extensions by defining - NO_GCC_BUILTINS. + NO_GCC_UNIQUE. Before Netpbm 10.65 (December 2013), Netpbm used GCC compiler extensions to generate SSE code in Pamflip. Starting in 10.65, Netpbm instead uses @@ -206,7 +206,7 @@ > cc --version Apple clang version 4.0 (tags/Apple/clang-421.0.60) (based on LLVM 3.1svn) - which masquerades as GCC 4.2.1, but it does not have SSE2 function + which masquerades as GCC 4.2.1, but it does not have SSE2 operator __builtin_ia32_pcmpeqb128 . On the other hand, research by Prophet of the Way in September 2012 @@ -214,20 +214,24 @@ compiled successfully with SSE exploitation), but 3.1 does not. He did not find any mention in documentation of that change. + At least some versions of Clang that do not have __builtin_ia32_pcmpeqb128 + nonetheless have other GCC SSE2 operators, such as __builtin_ia32_pcmpgtb128. + We did not detect a pattern. 
+ See below on compilers other than GCC that set __GNUC__: http://sourceforge.net/apps/mediawiki/predef/index.php?title=Compilers */ -#if defined(__GNUC__) && !defined(__clang__) && !defined(NO_GCC_BUILTINS) +#if defined(__GNUC__) && !defined(__clang__) && !defined(NO_GCC_UNIQUE) #define GCCVERSION __GNUC__*100 + __GNUC_MINOR__ #else #define GCCVERSION 0 #endif -/* HAVE_GCC_SSE2 means the compiler has GCC-specific builtins to directly - access SSE/SSE2 features. This is different from whether the compiler - generates code that uses these features at all. It is also different - from whether the compiler has the more standard operators defined in - <emmintrin.h>. +/* HAVE_GCC_SSE2 means the compiler has all of the GCC-specific builtins to + directly access SSE/SSE2 features. This is different from whether the + compiler generates code that uses these features at all. It is also + different from whether the compiler has the more standard operators defined + in <emmintrin.h>. */ #ifndef HAVE_GCC_SSE2 @@ -254,10 +258,9 @@ #endif #ifndef HAVE_GCC_BSWAP -#if GCCVERSION >=403 +#if GCCVERSION >=403 || defined(__clang__) #define HAVE_GCC_BSWAP 1 /* Use __builtin_bswap32(), __builtin_bswap64() for endian conversion. - Available from GCC v 4.3 onward. NOTE: On intel CPUs this may produce the bswap operand which is not available on 80386. */ #else diff --git a/urt/Makefile b/urt/Makefile index b94da1b2..0aef5290 100644 --- a/urt/Makefile +++ b/urt/Makefile @@ -5,6 +5,8 @@ endif SUBDIR = urt VPATH=.:$(SRCDIR)/$(SUBDIR) +default: all + include $(BUILDDIR)/config.mk LIBOBJECTS = Runput.o cmd_name.o \ @@ -15,6 +17,9 @@ LIBOBJECTS = Runput.o cmd_name.o \ MERGE_OBJECTS = +OMIT_URT_RULE = 1 +include $(SRCDIR)/common.mk + all: librle.a librle.a: $(LIBOBJECTS) @@ -24,11 +29,8 @@ librle.a: $(LIBOBJECTS) # Rule for objects. 
$(LIBOBJECTS): %.o: %.c importinc - $(CC) -c $(INCLUDES) -o $@ \ - $< $(CPPFLAGS) $(CFLAGS) $(CFLAGS_PERSONAL) $(CADD) + $(CC) -c $(INCLUDES) -o $@ $< $(CFLAGS_ALL) BINARIES = SCRIPTS = -OMIT_URT_RULE = 1 -include $(SRCDIR)/common.mk -- cgit 1.4.1