From d9cd52e64d7b6b0fd56566de87c826cb6fe3677d Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Thu, 25 Sep 2014 16:49:38 -0400 Subject: tile: optimize memcmp Customize memcmp.c for tile, using similar tricks from memcpy: - replace MERGE macro with dblalign. - replace memcmp_bytes function with revbytes. - use __glibc_likely. - use post-increment addressing. The schedule is still not perfect: the compiler is not hoisting code above the comparison branch, which could save a bundle or two. memcmp speeds up by 30-40% on shorter aligned tests in benchtest, with some tests with unaligned lengths taking a small performance hit. --- sysdeps/tile/memcmp.c | 367 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 367 insertions(+) create mode 100644 sysdeps/tile/memcmp.c (limited to 'sysdeps/tile') diff --git a/sysdeps/tile/memcmp.c b/sysdeps/tile/memcmp.c new file mode 100644 index 0000000000..8d39921734 --- /dev/null +++ b/sysdeps/tile/memcmp.c @@ -0,0 +1,367 @@ +/* Copyright (C) 1991-2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Torbjorn Granlund (tege@sics.se). + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#undef __ptr_t +#define __ptr_t void * + +#if defined HAVE_STRING_H || defined _LIBC +# include +#endif + +#undef memcmp + +#ifndef MEMCMP +# define MEMCMP memcmp +#endif + +#ifdef _LIBC + +# include +# include + +# if __BYTE_ORDER == __BIG_ENDIAN +# define WORDS_BIGENDIAN +# endif + +#else /* Not in the GNU C library. */ + +# include + +/* Type to use for aligned memory operations. + This should normally be the biggest type supported by a single load + and store. Must be an unsigned type. */ +# define op_t unsigned long int +# define OPSIZ (sizeof(op_t)) + +/* Threshold value for when to enter the unrolled loops. */ +# define OP_T_THRES 16 + +/* Type to use for unaligned operations. */ +typedef unsigned char byte; + +#endif /* In the GNU C library. */ + +/* Provide the appropriate builtins to shift two registers based on + the alignment of a pointer held in a third register, and to reverse + the bytes in a word. */ +#ifdef __tilegx__ +#define DBLALIGN __insn_dblalign +#define REVBYTES __insn_revbytes +#else +#define DBLALIGN __insn_dword_align +#define REVBYTES __insn_bytex +#endif + +#ifdef WORDS_BIGENDIAN +# define CMP_LT_OR_GT(a, b) ((a) > (b) ? 1 : -1) +#else +# define CMP_LT_OR_GT(a, b) (REVBYTES(a) > REVBYTES(b) ? 1 : -1) +#endif + +/* BE VERY CAREFUL IF YOU CHANGE THIS CODE! */ + +/* The strategy of this memcmp is: + + 1. Compare bytes until one of the block pointers is aligned. + + 2. Compare using memcmp_common_alignment or + memcmp_not_common_alignment, regarding the alignment of the other + block after the initial byte operations. The maximum number of + full words (of type op_t) are compared in this way. + + 3. Compare the few remaining bytes. */ + +static int memcmp_common_alignment (long, long, size_t) __THROW; + +/* memcmp_common_alignment -- Compare blocks at SRCP1 and SRCP2 with LEN `op_t' + objects (not LEN bytes!). Both SRCP1 and SRCP2 should be aligned for + memory operations on `op_t's. */ +static int +memcmp_common_alignment (srcp1, srcp2, len) + long int srcp1; + long int srcp2; + size_t len; +{ + op_t a0, a1; + op_t b0, b1; + + switch (len % 4) + { + default: /* Avoid warning about uninitialized local variables. */ + case 2: + a0 = ((op_t *) srcp1)[0]; + b0 = ((op_t *) srcp2)[0]; + srcp1 += OPSIZ; + srcp2 += OPSIZ; + len += 2; + goto do1; + case 3: + a1 = ((op_t *) srcp1)[0]; + b1 = ((op_t *) srcp2)[0]; + srcp1 += OPSIZ; + srcp2 += OPSIZ; + len += 1; + goto do2; + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return 0; + a0 = ((op_t *) srcp1)[0]; + b0 = ((op_t *) srcp2)[0]; + srcp1 += OPSIZ; + srcp2 += OPSIZ; + goto do3; + case 1: + a1 = ((op_t *) srcp1)[0]; + b1 = ((op_t *) srcp2)[0]; + srcp1 += OPSIZ; + srcp2 += OPSIZ; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + /* Fall through. */ + } + + do + { + a0 = ((op_t *) srcp1)[0]; + b0 = ((op_t *) srcp2)[0]; + srcp1 += OPSIZ; + srcp2 += OPSIZ; + if (__glibc_likely (a1 != b1)) + return CMP_LT_OR_GT (a1, b1); + + do3: + a1 = ((op_t *) srcp1)[0]; + b1 = ((op_t *) srcp2)[0]; + srcp1 += OPSIZ; + srcp2 += OPSIZ; + if (__glibc_likely (a0 != b0)) + return CMP_LT_OR_GT (a0, b0); + + do2: + a0 = ((op_t *) srcp1)[0]; + b0 = ((op_t *) srcp2)[0]; + srcp1 += OPSIZ; + srcp2 += OPSIZ; + if (__glibc_likely (a1 != b1)) + return CMP_LT_OR_GT (a1, b1); + + do1: + a1 = ((op_t *) srcp1)[0]; + b1 = ((op_t *) srcp2)[0]; + srcp1 += OPSIZ; + srcp2 += OPSIZ; + if (__glibc_likely (a0 != b0)) + return CMP_LT_OR_GT (a0, b0); + + len -= 4; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. */ + do0: + if (__glibc_likely (a1 != b1)) + return CMP_LT_OR_GT (a1, b1); + return 0; +} + +static int memcmp_not_common_alignment (long, long, size_t) __THROW; + +/* memcmp_not_common_alignment -- Compare blocks at SRCP1 and SRCP2 with LEN + `op_t' objects (not LEN bytes!). SRCP2 should be aligned for memory + operations on `op_t', but SRCP1 *should be unaligned*. */ +static int +memcmp_not_common_alignment (srcp1, srcp2, len) + long int srcp1; + long int srcp2; + size_t len; +{ + void * srcp1i; + op_t a0, a1, a2, a3; + op_t b0, b1, b2, b3; + op_t x; + + /* Calculate how to shift a word read at the memory operation + aligned srcp1 to make it aligned for comparison. */ + + srcp1i = (void *) srcp1; + + /* Make SRCP1 aligned by rounding it down to the beginning of the `op_t' + it points in the middle of. */ + srcp1 &= -OPSIZ; + + switch (len % 4) + { + default: /* Avoid warning about uninitialized local variables. */ + case 2: + a1 = ((op_t *) srcp1)[0]; + a2 = ((op_t *) srcp1)[1]; + b2 = ((op_t *) srcp2)[0]; + srcp1 += 2 * OPSIZ; + srcp2 += 1 * OPSIZ; + len += 2; + goto do1; + case 3: + a0 = ((op_t *) srcp1)[0]; + a1 = ((op_t *) srcp1)[1]; + b1 = ((op_t *) srcp2)[0]; + srcp1 += 2 * OPSIZ; + srcp2 += 1 * OPSIZ; + len += 1; + goto do2; + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return 0; + a3 = ((op_t *) srcp1)[0]; + a0 = ((op_t *) srcp1)[1]; + b0 = ((op_t *) srcp2)[0]; + srcp1 += 2 * OPSIZ; + srcp2 += 1 * OPSIZ; + goto do3; + case 1: + a2 = ((op_t *) srcp1)[0]; + a3 = ((op_t *) srcp1)[1]; + b3 = ((op_t *) srcp2)[0]; + srcp1 += 2 * OPSIZ; + srcp2 += 1 * OPSIZ; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + /* Fall through. */ + } + + do + { + a0 = ((op_t *) srcp1)[0]; + b0 = ((op_t *) srcp2)[0]; + x = DBLALIGN (a2, a3, srcp1i); + srcp1 += OPSIZ; + srcp2 += OPSIZ; + if (__glibc_likely (x != b3)) + return CMP_LT_OR_GT (x, b3); + + do3: + a1 = ((op_t *) srcp1)[0]; + b1 = ((op_t *) srcp2)[0]; + x = DBLALIGN (a3, a0, srcp1i); + srcp1 += OPSIZ; + srcp2 += OPSIZ; + if (__glibc_likely (x != b0)) + return CMP_LT_OR_GT (x, b0); + + do2: + a2 = ((op_t *) srcp1)[0]; + b2 = ((op_t *) srcp2)[0]; + x = DBLALIGN (a0, a1, srcp1i); + srcp1 += OPSIZ; + srcp2 += OPSIZ; + if (__glibc_likely (x != b1)) + return CMP_LT_OR_GT (x, b1); + + do1: + a3 = ((op_t *) srcp1)[0]; + b3 = ((op_t *) srcp2)[0]; + x = DBLALIGN (a1, a2, srcp1i); + srcp1 += OPSIZ; + srcp2 += OPSIZ; + if (__glibc_likely (x != b2)) + return CMP_LT_OR_GT (x, b2); + + len -= 4; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. */ + do0: + x = DBLALIGN (a2, a3, srcp1i); + if (__glibc_likely (x != b3)) + return CMP_LT_OR_GT (x, b3); + return 0; +} + +int +MEMCMP (s1, s2, len) + const __ptr_t s1; + const __ptr_t s2; + size_t len; +{ + op_t a0; + op_t b0; + long int srcp1 = (long int) s1; + long int srcp2 = (long int) s2; + int res; + + if (len >= OP_T_THRES) + { + /* There are at least some bytes to compare. No need to test + for LEN == 0 in this alignment loop. */ + while (srcp2 % OPSIZ != 0) + { + a0 = ((byte *) srcp1)[0]; + b0 = ((byte *) srcp2)[0]; + srcp1 += 1; + srcp2 += 1; + res = a0 - b0; + if (__glibc_likely (res != 0)) + return res; + len -= 1; + } + + /* SRCP2 is now aligned for memory operations on `op_t'. + SRCP1 alignment determines if we can do a simple, + aligned compare or need to shuffle bits. */ + + if (srcp1 % OPSIZ == 0) + res = memcmp_common_alignment (srcp1, srcp2, len / OPSIZ); + else + res = memcmp_not_common_alignment (srcp1, srcp2, len / OPSIZ); + if (res != 0) + return res; + + /* Number of bytes remaining in the interval [0..OPSIZ-1]. */ + srcp1 += len & -OPSIZ; + srcp2 += len & -OPSIZ; + len %= OPSIZ; + } + + /* There are just a few bytes to compare. Use byte memory operations. */ + while (len != 0) + { + a0 = ((byte *) srcp1)[0]; + b0 = ((byte *) srcp2)[0]; + srcp1 += 1; + srcp2 += 1; + res = a0 - b0; + if (__glibc_likely (res != 0)) + return res; + len -= 1; + } + + return 0; +} +libc_hidden_builtin_def(memcmp) +#ifdef weak_alias +# undef bcmp +weak_alias (memcmp, bcmp) +#endif -- cgit 1.4.1