diff options
-rw-r--r-- | ChangeLog | 4 | ||||
-rw-r--r-- | sysdeps/mips/strcmp.S | 249 |
2 files changed, 253 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog index 6ea4751067..cd987d9084 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +2014-10-01 Steve Ellcey <sellcey@mips.com> + + * sysdeps/mips/strcmp.S: New. + 2014-09-30 Joseph Myers <joseph@codesourcery.com> [BZ #14138] diff --git a/sysdeps/mips/strcmp.S b/sysdeps/mips/strcmp.S new file mode 100644 index 0000000000..e0c82127d9 --- /dev/null +++ b/sysdeps/mips/strcmp.S @@ -0,0 +1,249 @@ +/* Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifdef ANDROID_CHANGES +# include "machine/asm.h" +# include "machine/regdef.h" +#elif _LIBC +# include <sysdep.h> +# include <regdef.h> +# include <sys/asm.h> +#elif _COMPILING_NEWLIB +# include "machine/asm.h" +# include "machine/regdef.h" +#else +# include <regdef.h> +# include <sys/asm.h> +#endif + +/* Technically strcmp should not read past the end of the strings being + compared. We will read a full word that may contain excess bits beyond + the NULL string terminator but unless ENABLE_READAHEAD is set, we will not + read the next word after the end of string. Setting ENABLE_READAHEAD will + improve performance but is technically illegal based on the definition of + strcmp. */ +#ifdef ENABLE_READAHEAD +# define DELAY_READ +#else +# define DELAY_READ nop +#endif + +/* Testing on a little endian machine showed using CLZ was a + performance loss, so we are not turning it on by default. */ +#if defined(ENABLE_CLZ) && (__mips_isa_rev > 1) +# define USE_CLZ +#endif + +/* Some asm.h files do not have the L macro definition. */ +#ifndef L +# if _MIPS_SIM == _ABIO32 +# define L(label) $L ## label +# else +# define L(label) .L ## label +# endif +#endif + +/* Some asm.h files do not have the PTR_ADDIU macro definition. */ +#ifndef PTR_ADDIU +# ifdef USE_DOUBLE +# define PTR_ADDIU daddiu +# else +# define PTR_ADDIU addiu +# endif +#endif + +/* Allow the routine to be named something else if desired. */ +#ifndef STRCMP_NAME +# define STRCMP_NAME strcmp +#endif + +#ifdef ANDROID_CHANGES +LEAF(STRCMP_NAME, 0) +#else +LEAF(STRCMP_NAME) +#endif + .set nomips16 + .set noreorder + + or t0, a0, a1 + andi t0,0x3 + bne t0, zero, L(byteloop) + +/* Both strings are 4 byte aligned at this point. */ + + lui t8, 0x0101 + ori t8, t8, 0x0101 + lui t9, 0x7f7f + ori t9, 0x7f7f + +#define STRCMP32(OFFSET) \ + lw v0, OFFSET(a0); \ + lw v1, OFFSET(a1); \ + subu t0, v0, t8; \ + bne v0, v1, L(worddiff); \ + nor t1, v0, t9; \ + and t0, t0, t1; \ + bne t0, zero, L(returnzero) + +L(wordloop): + STRCMP32(0) + DELAY_READ + STRCMP32(4) + DELAY_READ + STRCMP32(8) + DELAY_READ + STRCMP32(12) + DELAY_READ + STRCMP32(16) + DELAY_READ + STRCMP32(20) + DELAY_READ + STRCMP32(24) + DELAY_READ + STRCMP32(28) + PTR_ADDIU a0, a0, 32 + b L(wordloop) + PTR_ADDIU a1, a1, 32 + +L(returnzero): + j ra + move v0, zero + +L(worddiff): +#ifdef USE_CLZ + subu t0, v0, t8 + nor t1, v0, t9 + and t1, t0, t1 + xor t0, v0, v1 + or t0, t0, t1 +# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + wsbh t0, t0 + rotr t0, t0, 16 +# endif + clz t1, t0 + and t1, 0xf8 +# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + neg t1 + addu t1, 24 +# endif + rotrv v0, v0, t1 + rotrv v1, v1, t1 + and v0, v0, 0xff + and v1, v1, 0xff + j ra + subu v0, v0, v1 +#else /* USE_CLZ */ +# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + andi t0, v0, 0xff + beq t0, zero, L(wexit01) + andi t1, v1, 0xff + bne t0, t1, L(wexit01) + + srl t8, v0, 8 + srl t9, v1, 8 + andi t8, t8, 0xff + beq t8, zero, L(wexit89) + andi t9, t9, 0xff + bne t8, t9, L(wexit89) + + srl t0, v0, 16 + srl t1, v1, 16 + andi t0, t0, 0xff + beq t0, zero, L(wexit01) + andi t1, t1, 0xff + bne t0, t1, L(wexit01) + + srl t8, v0, 24 + srl t9, v1, 24 +# else /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */ + srl t0, v0, 24 + beq t0, zero, L(wexit01) + srl t1, v1, 24 + bne t0, t1, L(wexit01) + + srl t8, v0, 16 + srl t9, v1, 16 + andi t8, t8, 0xff + beq t8, zero, L(wexit89) + andi t9, t9, 0xff + bne t8, t9, L(wexit89) + + srl t0, v0, 8 + srl t1, v1, 8 + andi t0, t0, 0xff + beq t0, zero, L(wexit01) + andi t1, t1, 0xff + bne t0, t1, L(wexit01) + + andi t8, v0, 0xff + andi t9, v1, 0xff +# endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */ + +L(wexit89): + j ra + subu v0, t8, t9 +L(wexit01): + j ra + subu v0, t0, t1 +#endif /* USE_CLZ */ + +/* It might seem better to do the 'beq' instruction between the two 'lbu' + instructions so that the nop is not needed but testing showed that this + code is actually faster (based on glibc strcmp test). */ +#define BYTECMP01(OFFSET) \ + lbu v0, OFFSET(a0); \ + lbu v1, OFFSET(a1); \ + beq v0, zero, L(bexit01); \ + nop; \ + bne v0, v1, L(bexit01) + +#define BYTECMP89(OFFSET) \ + lbu t8, OFFSET(a0); \ + lbu t9, OFFSET(a1); \ + beq t8, zero, L(bexit89); \ + nop; \ + bne t8, t9, L(bexit89) + +L(byteloop): + BYTECMP01(0) + BYTECMP89(1) + BYTECMP01(2) + BYTECMP89(3) + BYTECMP01(4) + BYTECMP89(5) + BYTECMP01(6) + BYTECMP89(7) + PTR_ADDIU a0, a0, 8 + b L(byteloop) + PTR_ADDIU a1, a1, 8 + +L(bexit01): + j ra + subu v0, v0, v1 +L(bexit89): + j ra + subu v0, t8, t9 + + .set at + .set reorder + +END(STRCMP_NAME) +#ifndef ANDROID_CHANGES +# ifdef _LIBC +libc_hidden_builtin_def (STRCMP_NAME) +# endif +#endif |