diff options
author | giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8> | 2010-05-18 02:37:18 +0000 |
---|---|---|
committer | giraffedata <giraffedata@9d0c8265-081b-0410-96cb-a4ca84ce46f8> | 2010-05-18 02:37:18 +0000 |
commit | 60a0d64307394bad62684c32bbccf89552d15090 (patch) | |
tree | ba89fd811ea731b9a0cc0e64ff4fe9c05e92f0b2 | |
parent | 559136d4679d53037e4b74dd3f833b2ea89ccc05 (diff) | |
download | netpbm-mirror-60a0d64307394bad62684c32bbccf89552d15090.tar.gz netpbm-mirror-60a0d64307394bad62684c32bbccf89552d15090.tar.xz netpbm-mirror-60a0d64307394bad62684c32bbccf89552d15090.zip |
Pamflip PBM speedup with SSE et al
git-svn-id: http://svn.code.sf.net/p/netpbm/code/trunk@1214 9d0c8265-081b-0410-96cb-a4ca84ce46f8
-rw-r--r-- | doc/HISTORY | 5 | ||||
-rw-r--r-- | editor/Makefile | 5 | ||||
-rw-r--r-- | editor/pamflip/Makefile | 32 | ||||
-rw-r--r-- | editor/pamflip/flip.h | 16 | ||||
-rw-r--r-- | editor/pamflip/pamflip.c (renamed from editor/pamflip.c) | 82 | ||||
-rw-r--r-- | editor/pamflip/pamflip.test (renamed from editor/pamflip.test) | 0 | ||||
-rw-r--r-- | editor/pamflip/pamflip_sse.c | 345 | ||||
-rw-r--r-- | editor/pamflip/pamflip_sse.h | 12 |
8 files changed, 464 insertions, 33 deletions
diff --git a/doc/HISTORY b/doc/HISTORY index ed422a16..493b04a2 100644 --- a/doc/HISTORY +++ b/doc/HISTORY @@ -19,9 +19,14 @@ not yet BJH Release 10.51.00 pnmsmooth: Don't display pnmconvol messages (i.e. run pnmconvol with -quiet). + pamflip: speedup for PBM. Use SSE2 and skip some idempotent + pixel movement. Thanks Prophet of the Way <afu@wta.att.ne.jp>. + libnetpbm, various PBM programs: Use SSE insted of MMX. Thanks Prophet of the Way <afu@wta.att.ne.jp>. + libnetpbm: remove bogus assertion. + pnmtops: fix bug: 12 bits per sample output when 8 would do. Introduced in 10.40. diff --git a/editor/Makefile b/editor/Makefile index 29cdcab1..52b2b0bc 100644 --- a/editor/Makefile +++ b/editor/Makefile @@ -7,7 +7,7 @@ VPATH=.:$(SRCDIR)/$(SUBDIR) include $(BUILDDIR)/config.mk -SUBDIRS = specialty +SUBDIRS = pamflip specialty # We tend to separate out the build targets so that we don't have # any more dependencies for a given target than it really needs. @@ -19,7 +19,7 @@ SUBDIRS = specialty PORTBINARIES = pamaddnoise pambackground pamcomp pamcut \ pamdice pamditherbw pamedge \ pamenlarge \ - pamflip pamfunc pammasksharpen \ + pamfunc pammasksharpen \ pampaintspill pamperspective \ pamscale pamsistoaglyph pamstretch pamthreshold pamundice \ pbmclean pbmmask pbmpscale pbmreduce \ @@ -42,7 +42,6 @@ PORTBINARIES = pamaddnoise pambackground pamcomp pamcut \ NOMERGEBINARIES = MERGEBINARIES = $(PORTBINARIES) - BINARIES = $(MERGEBINARIES) $(NOMERGEBINARIES) SCRIPTS = pnmflip ppmfade ppmquant ppmquantall ppmshadow \ pamstretch-gen pnmmargin pnmquant diff --git a/editor/pamflip/Makefile b/editor/pamflip/Makefile new file mode 100644 index 00000000..497c5379 --- /dev/null +++ b/editor/pamflip/Makefile @@ -0,0 +1,32 @@ +ifeq ($(SRCDIR)x,x) + SRCDIR = $(CURDIR)/../.. 
+ BUILDDIR = $(SRCDIR) +endif +SUBDIR = editor/pamflip +VPATH=.:$(SRCDIR)/$(SUBDIR) + +include $(BUILDDIR)/config.mk + +SUBDIRS = + +MERGEBINARIES = pamflip + +BINARIES = pamflip + +SCRIPTS = + +PAMFLIP_OBJECTS = pamflip.o pamflip_sse.o + +OBJECTS = $(PAMFLIP_OBJECTS) + +MERGE_OBJECTS = $(OBJECTS:%.o=%.o2) + +.PHONY: all +all: $(BINARIES) $(SUBDIRS:%=%/all) + +include $(SRCDIR)/common.mk + +pamflip: $(PAMFLIP_OBJECTS) $(NETPBMLIB) $(LIBOPT) + $(LD) -o $@ $(PAMFLIP_OBJECTS) \ + $(shell $(LIBOPT) $(NETPBMLIB)) \ + $(MATHLIB) $(LDFLAGS) $(LDLIBS) $(RPATH) $(LADD) diff --git a/editor/pamflip/flip.h b/editor/pamflip/flip.h new file mode 100644 index 00000000..612a7f84 --- /dev/null +++ b/editor/pamflip/flip.h @@ -0,0 +1,16 @@ +#ifndef FLIP_H_INCLUDED +#define FLIP_H_INCLUDED + +struct xformCore { + /* a b + c d + */ + int a; /* -1, 0, or 1 */ + int b; /* -1, 0, or 1 */ + int c; /* -1, 0, or 1 */ + int d; /* -1, 0, or 1 */ +}; + + + +#endif diff --git a/editor/pamflip.c b/editor/pamflip/pamflip.c index b8b079f0..1c758d67 100644 --- a/editor/pamflip.c +++ b/editor/pamflip/pamflip.c @@ -26,6 +26,10 @@ transformRowsBottomTopNonPbm() non-PBM image with bottom-top transformation (also works for PBM, but we don't use it) + transformRowsToColumnsPbmSse() + PBM image with column-for-row transformation + requires Intel/AMD x86 SSE2 + (can only do 90 degree/xy flips) transformPbm() PBM image with column-for-row transformation (also works for all other transformations, but we don't use it) @@ -68,8 +72,19 @@ #include "nstring.h" #include "bitreverse.h" +#include "flip.h" +#include "pamflip_sse.h" + enum xformType {LEFTRIGHT, TOPBOTTOM, TRANSPOSE}; +#ifndef SIMD_PBM_TRANSPOSITION + #if HAVE_GCC_SSE2 && defined(__SSE2__) + #define SIMD_PBM_TRANSPOSITION 1 + #else + #define SIMD_PBM_TRANSPOSITION 0 + #endif +#endif + static void parseXformOpt(const char * const xformOpt, unsigned int * const xformCountP, @@ -115,25 +130,13 @@ parseXformOpt(const char * const xformOpt, -/* See 
transformPoint() for an explanation of the transform matrix types. - The difference between the two types is that 'xformCore' is particular - to the source image dimensions and can be used to do the transformation, - while 'xformCore' is independent of the source image and just - tells what kind of transformation. +/* See transformPoint() for an explanation of the transform matrix types. The + difference between xformCore and xformMatrix is that 'xformMatrix' is + particular to the source image dimensions and can be used to do the + transformation, while 'xformCore' is independent of the source image and + just tells what kind of transformation. */ -struct xformCore { - /* a b - c d - */ - int a; /* -1, 0, or 1 */ - int b; /* -1, 0, or 1 */ - int c; /* -1, 0, or 1 */ - int d; /* -1, 0, or 1 */ -}; - - - struct xformMatrix { /* a b 0 c d 0 @@ -397,6 +400,9 @@ bitOrderReverse(unsigned char * const bitrow, /*---------------------------------------------------------------------------- Reverse the bits in a packed pbm row (1 bit per pixel). I.e. the leftmost bit becomes the rightmost, etc. + + Exchange pixels in units of eight. If both are zero, skip instead of + exchanging zeros. 
-----------------------------------------------------------------------------*/ unsigned int const lastfullByteIdx = cols/8 - 1; @@ -407,11 +413,14 @@ bitOrderReverse(unsigned char * const bitrow, bitrow[0] = bitreverse[bitrow[0]] << (8-cols); else if (cols % 8 == 0) { unsigned int i, j; - for (i = 0, j = lastfullByteIdx; i <= j; ++i, --j) { - unsigned char const t = bitreverse[bitrow[j]]; - bitrow[j] = bitreverse[bitrow[i]]; - bitrow[i] = t; - } + for (i = 0, j = lastfullByteIdx; i <= j; ++i, --j) + if ((bitrow[j] | bitrow[i]) == 0) { + /* Both are 0x00 - skip */ + } else { + unsigned char const t = bitreverse[bitrow[j]]; + bitrow[j] = bitreverse[bitrow[i]]; + bitrow[i] = t; + } } else { unsigned int const m = cols % 8; @@ -422,18 +431,23 @@ bitOrderReverse(unsigned char * const bitrow, unsigned char th, tl; /* 16 bit temp ( th << 8 | tl ) */ tl = 0; for (i = 0, j = lastfullByteIdx+1; i <= lastfullByteIdx/2; ++i, --j) { - th = bitreverse[bitrow[i]]; - bitrow[i] = - bitreverse[0xff & ((bitrow[j-1] << 8 | bitrow[j]) >> (8-m))]; - bitrow[j] = 0xff & ((th << 8 | tl) >> m); - tl = th; + if( (tl | bitrow[i] | bitrow[j] | bitrow[j-1]) != 0) { + /* Skip if both are 0x00 */ + th = bitreverse[bitrow[i]]; + bitrow[i] = + bitreverse[0xff & ((bitrow[j-1] << 8 + | bitrow[j]) >> (8-m))]; + bitrow[j] = 0xff & ((th << 8 | tl) >> m); + tl = th; + } } - if (i == j) + if (i == j && (bitrow[i] | tl) != 0) { /* bitrow[] has an odd number of bytes (an even number of full bytes; lastfullByteIdx is odd), so we did all but the center byte above. We do the center byte now. */ bitrow[j] = 0xff & ((bitreverse[bitrow[i]] << 8 | tl) >> m); + } } } @@ -636,6 +650,10 @@ writeRaster(struct pam * const pamP, + + + + static void transformPbmGen(struct pam * const inpamP, struct pam * const outpamP, @@ -1125,11 +1143,15 @@ transformPbm(struct pam * const inpamP, through them only twice, so there is no page thrashing concern. 
*/ transformRowsBottomTopPbm(inpamP, outpamP, xform.a == -1); - } else + } else { /* This is a column-for-row type of transformation, which requires complex traversal of an in-memory image. */ - transformPbmGen(inpamP, outpamP, xform); + if (SIMD_PBM_TRANSPOSITION == 1) + pamflip_transformRowsToColumnsPbmSse(inpamP, outpamP, xform); + else + transformPbmGen(inpamP, outpamP, xform); + } } diff --git a/editor/pamflip.test b/editor/pamflip/pamflip.test index 96e889ea..96e889ea 100644 --- a/editor/pamflip.test +++ b/editor/pamflip/pamflip.test diff --git a/editor/pamflip/pamflip_sse.c b/editor/pamflip/pamflip_sse.c new file mode 100644 index 00000000..3a2d28d7 --- /dev/null +++ b/editor/pamflip/pamflip_sse.c @@ -0,0 +1,345 @@ +#include <assert.h> + +#include "pm_config.h" +#include "pm_c_util.h" +#include "pam.h" + +#include "flip.h" + +#include "pamflip_sse.h" + +#if HAVE_GCC_SSE2 && defined(__SSE2__) + +/*---------------------------------------------------------------------------- + This is a specialized routine for row-for-column PBM transformations. + (-cw, -ccw, -xy). It requires GCC (>= v. 4.0.4) and SSE2. + + In each cycle, we read sixteen rows from the input. We process this band + left to right in blocks 8 pixels wide. We use the SSE2 instruction + pmovmskb128, which reports the MSB of each byte in a 16 byte array, for + fast processing. We place the 8x16 block into a 16 byte array, and + pmovmskb128 reports the 16 pixels on the left edge in one instruction + execution. pslldi128 shifts the array contents leftward. + + The following routines can write in both directions (left and right) + into the output rows. They do this by controlling the vertical stacking + order when they make the 8x16 blocks. + + Function transpose1to15Bitrows() is for handling the partial bits of each + output row. They can come from either the top or bottom of the vertical + input column, but they always go to the right end of the output rows. 
+ + transformRowsToColumnsPbm() does not have any instructions unique to + GCC or SSE. It is possible to write a non-SSE version by providing + generic versions of transpose16Bitrows() and transpose1to15Bitrows() . + This is just a matter of replacing the V16 union with a plain uchar + array and writing an emulation for __builtin_ia32_pmovmskb128() . + + Further enhancement should be possible by employing wider bands, + larger blocks as wider SIMD registers become available. Another + method is checking for white blocks and recording them in a small + array and condensing writes into the output raster array. +-----------------------------------------------------------------------------*/ + +typedef char v16qi __attribute__ ((vector_size (16))); +typedef int v4di __attribute__ ((vector_size (16))); + +union V16 { + v16qi v; + v4di d; + unsigned char i[16]; +}; + +/* Beware when making modifications to code which involve v16qi, v4di, V16. + Certain versions of GCC get stuck with the following: + + (1) Type mismatches between v16qi and v4di. Avoid them with casts. + + (2) Conversions from a 16 byte array to v16qi (or union V16) by cast. + __vector__ variables have to be vector from the start. + + (3) union V16 as a register variable. + + Some GCC versions emit warnings, others abort with error. +*/ + + + +static void +transpose1to15Bitrows(unsigned int const cols, + unsigned int const rows, + bit ** const inrow, + uint16_t ** const outplane, + int const xdir) { +/*-------------------------------------------------------------------------- + Convert input rows to output columns. For handling partial rows. + Note that output from this always goes to the right edge of the image. 
+----------------------------------------------------------------------------*/ + unsigned int const outcol16 = (rows-1)/16; + + unsigned int col; + + union V16 v16; + v16.v = v16.v ^ v16.v; /* clear to zero */ + + for (col = 0; col < cols; ++col) { + unsigned int const outrow = col; + + if (col % 8 == 0) { + unsigned int i; + for (i = 0; i < rows % 16; ++i) { + int const idx = (xdir > 0) ? + (i&8) + 7-(i&7) : /* output left to right */ + (24- rows%16 +i) %16; /* right to left */ + v16.i [idx] = inrow[i][col/8]; + } + } + /* read 16 bits from left edge of block; write to output column */ + outplane[outrow][outcol16] = __builtin_ia32_pmovmskb128(v16.v); + v16.d = __builtin_ia32_pslldi128(v16.d, 1); + } +} + + + +static void +transpose16Bitrows(unsigned int const cols, + unsigned int const rows, + const bit * const block[16], + uint16_t ** const outplane, + unsigned int const outcol16) { +/*-------------------------------------------------------------------------- + Convert input rows to output columns. Works in units of 8x16. + + Uses pre-calculated pointers ( block[i][col8] ) instead of + (xdir > 0) ? (i & 0x08) + 7 - (i & 0x07) : (24 - rows % 16 +i) % 16 + for efficiency. + + We avoid using union V16 to keep the value in a register. (When we do so, + GCC (4.2, 4.4) sees the suffix x of v16.i[x] and apparently decides that + the variable has to be addressable and therefore needs to be placed into + memory.) 
+ ----------------------------------------------------------------------------*/ + unsigned int col; + + for (col = 0; col + 7 < cols; col += 8) { /* Unrolled loop */ + unsigned int const col8 = col / 8; + + unsigned int outrow; + unsigned int i; + + register v16qi vReg = { + block[0][col8], block[1][col8], + block[2][col8], block[3][col8], + block[4][col8], block[5][col8], + block[6][col8], block[7][col8], + block[8][col8], block[9][col8], + block[10][col8], block[11][col8], + block[12][col8], block[13][col8], + block[14][col8], block[15][col8] }; + + outrow = col; /* initial value */ + + for (i = 0; i < 7; ++i) { + /* GCC (>=4.2) automatically unrolls this loop */ + outplane[outrow++][outcol16] = __builtin_ia32_pmovmskb128(vReg); + vReg = (v16qi)__builtin_ia32_pslldi128 ((v4di)vReg, 1); + } + outplane[outrow][outcol16] = __builtin_ia32_pmovmskb128(vReg); + } + + if (col < cols) { /* Transpose remaining pixels at end of input rows. */ + unsigned int const col8 = col / 8; + register v16qi vReg = { + block[0][col8], block[1][col8], + block[2][col8], block[3][col8], + block[4][col8], block[5][col8], + block[6][col8], block[7][col8], + block[8][col8], block[9][col8], + block[10][col8], block[11][col8], + block[12][col8], block[13][col8], + block[14][col8], block[15][col8] }; + + for ( ; col < cols; ++col) { + unsigned int const outrow = col; + + outplane[outrow][outcol16] = __builtin_ia32_pmovmskb128(vReg); + vReg = (v16qi)__builtin_ia32_pslldi128((v4di)vReg, 1); + } + } +} + + + +static void +analyzeBlock(struct pam * const inpamP, + bit ** const inrow, + int const xdir, + const bit ** const block, + unsigned int * const topOfFullBlockP, + unsigned int * const outcol16P) { + + if (xdir > 0){ + /* Write output columns left to right */ + unsigned int i; + for (i = 0; i < 16; ++i) + block[i] = inrow[(i & 0x8) + 7 - (i & 0x7)]; + *topOfFullBlockP = 0; + *outcol16P = 0; + } else { + /* Write output columns right to left */ + *topOfFullBlockP = inpamP->height % 16; + + if 
(inpamP->height >= 16) { + unsigned int i; + for (i = 0; i < 16; ++i) + block[i]= inrow[((i & 0x8) ^ 0x8) + (i & 0x7)]; + *outcol16P = inpamP->height/16 - 1; + } else + *outcol16P = 0; + } +} + + + +static void +doPartialBlockTop(struct pam * const inpamP, + bit ** const inrow, + int const xdir, + unsigned int const topOfFullBlock, + uint16_t ** const outplane) { + + if (topOfFullBlock > 0) { + unsigned int row; + for (row = 0; row < topOfFullBlock; ++row) + pbm_readpbmrow_packed(inpamP->file, inrow[row], + inpamP->width, inpamP->format); + + transpose1to15Bitrows(inpamP->width, inpamP->height, + inrow, outplane, xdir); + /* Transpose partial rows on top of input. Place on right edge of + output. + */ + } +} + + + +static void +doFullBlocks(struct pam * const inpamP, + bit ** const inrow, + int const xdir, + const bit * const block[16], + unsigned int const topOfFullBlock, + unsigned int const initOutcol16, + uint16_t ** const outplane) { + + unsigned int row; + unsigned int outcol16; + unsigned int modrow; + /* Number of current row within buffer */ + + for (row = topOfFullBlock, outcol16 = initOutcol16, modrow = 0; + row < inpamP->height; + ++row) { + + pbm_readpbmrow_packed(inpamP->file, inrow[modrow], + inpamP->width, inpamP->format); + ++modrow; + if (modrow == 16) { + /* 16 row buffer is full. Transpose. */ + modrow = 0; + + transpose16Bitrows(inpamP->width, inpamP->height, + block, outplane, outcol16); + outcol16 += xdir; + } + } +} + + + +static void +doPartialBlockBottom(struct pam * const inpamP, + bit ** const inrow, + int const xdir, + uint16_t ** const outplane) { + + if (xdir > 0 && inpamP->height % 16 > 0) + transpose1to15Bitrows(inpamP->width, inpamP->height, inrow, + outplane, xdir); + /* Transpose partial rows on bottom of input. Place on right edge of + output. 
+ */ +} + + + +static void +writeOut(struct pam * const outpamP, + uint16_t ** const outplane, + int const ydir) { + + unsigned int row; + + for (row = 0; row < outpamP->height; ++row) { + unsigned int const outrow = (ydir > 0) ? + row : + outpamP->height - row - 1; /* reverse order */ + + pbm_writepbmrow_packed(stdout, (bit *)outplane[outrow], + outpamP->width, 0); + } +} + + + +void +pamflip_transformRowsToColumnsPbmSse(struct pam * const inpamP, + struct pam * const outpamP, + struct xformCore const xformCore) { +/*---------------------------------------------------------------------------- + This is a specialized routine for row-for-column PBM transformations. + (-cw, -ccw, -xy). +-----------------------------------------------------------------------------*/ + int const xdir = xformCore.c; + /* Input top: output left (+1)/ right (-1) */ + int const ydir = xformCore.b; + /* Input left: output top (+1)/ bottom (-1) */ + + bit ** inrow; + uint16_t ** outplane; + const bit * block[16]; + unsigned int topOfFullBlock; + unsigned int outcol16; + + inrow = pbm_allocarray_packed( inpamP->width, 16); + outplane = (uint16_t **)pbm_allocarray_packed(outpamP->width + 15, + outpamP->height); + /* We write to the output array in 16 bit units. Add margin (15). 
*/ + + analyzeBlock(inpamP, inrow, xdir, block, &topOfFullBlock, &outcol16); + + doPartialBlockTop(inpamP, inrow, xdir, topOfFullBlock, outplane); + + doFullBlocks(inpamP, inrow, xdir, block, + topOfFullBlock, outcol16, outplane); + + doPartialBlockBottom(inpamP, inrow, xdir, outplane); + + writeOut(outpamP, outplane, ydir); + + pbm_freearray(outplane, outpamP->height); + pbm_freearray(inrow, 16); +} +#else /* SSE functions exist */ + +void +pamflip_transformRowsToColumnsPbmSse(struct pam * const inpamP, + struct pam * const outpamP, + struct xformCore const xformCore) { + + /* Nobody is supposed to call this */ + assert(false); +} +#endif diff --git a/editor/pamflip/pamflip_sse.h b/editor/pamflip/pamflip_sse.h new file mode 100644 index 00000000..1e70b765 --- /dev/null +++ b/editor/pamflip/pamflip_sse.h @@ -0,0 +1,12 @@ +#ifndef PAMFLIP_SSE_H_INCLUDED +#define PAMFLIP_SSE_H_INCLUDED + +struct pam; +#include "flip.h" + +void +pamflip_transformRowsToColumnsPbmSse(struct pam * const inpamP, + struct pam * const outpamP, + struct xformCore const xformCore); + +#endif |