From bc5a8e94bef09618fcb3e086dc5a42c2ed98e530 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 20 Aug 2015 08:05:39 -0700 Subject: Add i386 memchr multiarch functions --- sysdeps/i386/i686/multiarch/Makefile | 1 - sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S | 496 ------------------ sysdeps/i386/i686/multiarch/memchr-sse2.S | 705 -------------------------- sysdeps/i386/i686/multiarch/memchr.S | 65 --- sysdeps/i386/multiarch/Makefile | 3 +- sysdeps/i386/multiarch/ifunc-impl-list.c | 4 +- sysdeps/i386/multiarch/memchr-sse2-bsf.S | 496 ++++++++++++++++++ sysdeps/i386/multiarch/memchr-sse2.S | 705 ++++++++++++++++++++++++++ sysdeps/i386/multiarch/memchr.S | 65 +++ 9 files changed, 1270 insertions(+), 1270 deletions(-) delete mode 100644 sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S delete mode 100644 sysdeps/i386/i686/multiarch/memchr-sse2.S delete mode 100644 sysdeps/i386/i686/multiarch/memchr.S create mode 100644 sysdeps/i386/multiarch/memchr-sse2-bsf.S create mode 100644 sysdeps/i386/multiarch/memchr-sse2.S create mode 100644 sysdeps/i386/multiarch/memchr.S diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index 3b1190eb12..c088a39bea 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -7,7 +7,6 @@ sysdep_routines += strcmp-ssse3 \ strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \ strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \ strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \ - memchr-sse2 memchr-sse2-bsf \ memrchr-sse2 memrchr-sse2-bsf memrchr-c \ rawmemchr-sse2 rawmemchr-sse2-bsf \ strnlen-sse2 strnlen-c \ diff --git a/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S deleted file mode 100644 index a5ae86a7ff..0000000000 --- a/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S +++ /dev/null @@ -1,496 +0,0 @@ -/* Optimized memchr with sse2 - Copyright (C) 2011-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#if IS_IN (libc) - -# include - -# define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -# define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) - -# define PARMS 4 -# define STR1 PARMS -# define STR2 STR1+4 - -# ifndef USE_AS_RAWMEMCHR -# define LEN STR2+4 -# define RETURN POP(%edi); ret; CFI_PUSH(%edi); -# endif - -# ifndef MEMCHR -# define MEMCHR __memchr_sse2_bsf -# endif - - .text -ENTRY (MEMCHR) - - mov STR1(%esp), %ecx - movd STR2(%esp), %xmm1 - -# ifndef USE_AS_RAWMEMCHR - mov LEN(%esp), %edx - test %edx, %edx - jz L(return_null_1) -# endif - mov %ecx, %eax - - punpcklbw %xmm1, %xmm1 - punpcklbw %xmm1, %xmm1 - - and $63, %ecx - pshufd $0, %xmm1, %xmm1 - - cmp $48, %ecx - ja L(crosscache) - - movdqu (%eax), %xmm0 - pcmpeqb %xmm1, %xmm0 -/* Check if there is a match. */ - pmovmskb %xmm0, %ecx - test %ecx, %ecx - je L(unaligned_no_match_1) -/* Check which byte is a match. */ - bsf %ecx, %ecx - -# ifndef USE_AS_RAWMEMCHR - sub %ecx, %edx - jbe L(return_null_1) -# endif - add %ecx, %eax - ret - - .p2align 4 -L(unaligned_no_match_1): -# ifndef USE_AS_RAWMEMCHR - sub $16, %edx - jbe L(return_null_1) - PUSH (%edi) - lea 16(%eax), %edi - and $15, %eax - and $-16, %edi - add %eax, %edx -# else - lea 16(%eax), %edx - and $-16, %edx -# endif - jmp L(loop_prolog) - - .p2align 4 -L(return_null_1): - xor %eax, %eax - ret - -# ifndef USE_AS_RAWMEMCHR - CFI_POP (%edi) -# endif - - .p2align 4 -L(crosscache): -/* Handle unaligned string. */ - -# ifndef USE_AS_RAWMEMCHR - PUSH (%edi) - mov %eax, %edi - and $15, %ecx - and $-16, %edi - movdqa (%edi), %xmm0 -# else - mov %eax, %edx - and $15, %ecx - and $-16, %edx - movdqa (%edx), %xmm0 -# endif - pcmpeqb %xmm1, %xmm0 -/* Check if there is a match. */ - pmovmskb %xmm0, %eax -/* Remove the leading bytes. */ - sar %cl, %eax - test %eax, %eax - je L(unaligned_no_match) -/* Check which byte is a match. */ - bsf %eax, %eax - -# ifndef USE_AS_RAWMEMCHR - sub %eax, %edx - jbe L(return_null) - add %edi, %eax - add %ecx, %eax - RETURN -# else - add %edx, %eax - add %ecx, %eax - ret -# endif - - .p2align 4 -L(unaligned_no_match): -# ifndef USE_AS_RAWMEMCHR - sub $16, %edx - add %ecx, %edx - jle L(return_null) - add $16, %edi -# else - add $16, %edx -# endif - - .p2align 4 -/* Loop start on aligned string. */ -L(loop_prolog): -# ifndef USE_AS_RAWMEMCHR - sub $64, %edx - jbe L(exit_loop) - movdqa (%edi), %xmm0 -# else - movdqa (%edx), %xmm0 -# endif - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - -# ifndef USE_AS_RAWMEMCHR - movdqa 16(%edi), %xmm2 -# else - movdqa 16(%edx), %xmm2 -# endif - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - -# ifndef USE_AS_RAWMEMCHR - movdqa 32(%edi), %xmm3 -# else - movdqa 32(%edx), %xmm3 -# endif - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - -# ifndef USE_AS_RAWMEMCHR - movdqa 48(%edi), %xmm4 -# else - movdqa 48(%edx), %xmm4 -# endif - pcmpeqb %xmm1, %xmm4 - -# ifndef USE_AS_RAWMEMCHR - add $64, %edi -# else - add $64, %edx -# endif - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - -# ifndef USE_AS_RAWMEMCHR - test $0x3f, %edi -# else - test $0x3f, %edx -# endif - jz L(align64_loop) - -# ifndef USE_AS_RAWMEMCHR - sub $64, %edx - jbe L(exit_loop) - movdqa (%edi), %xmm0 -# else - movdqa (%edx), %xmm0 -# endif - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - -# ifndef USE_AS_RAWMEMCHR - movdqa 16(%edi), %xmm2 -# else - movdqa 16(%edx), %xmm2 -# endif - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - -# ifndef USE_AS_RAWMEMCHR - movdqa 32(%edi), %xmm3 -# else - movdqa 32(%edx), %xmm3 -# endif - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - -# ifndef USE_AS_RAWMEMCHR - movdqa 48(%edi), %xmm3 -# else - movdqa 48(%edx), %xmm3 -# endif - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - -# ifndef USE_AS_RAWMEMCHR - add $64, %edi -# else - add $64, %edx -# endif - test %eax, %eax - jnz L(matches0) - -# ifndef USE_AS_RAWMEMCHR - mov %edi, %ecx - and $-64, %edi - and $63, %ecx - add %ecx, %edx -# else - and $-64, %edx -# endif - - .p2align 4 -L(align64_loop): -# ifndef USE_AS_RAWMEMCHR - sub $64, %edx - jbe L(exit_loop) - movdqa (%edi), %xmm0 - movdqa 16(%edi), %xmm2 - movdqa 32(%edi), %xmm3 - movdqa 48(%edi), %xmm4 -# else - movdqa (%edx), %xmm0 - movdqa 16(%edx), %xmm2 - movdqa 32(%edx), %xmm3 - movdqa 48(%edx), %xmm4 -# endif - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm0, %xmm3 - pmaxub %xmm2, %xmm4 - pmaxub %xmm3, %xmm4 - pmovmskb %xmm4, %eax - -# ifndef USE_AS_RAWMEMCHR - add $64, %edi -# else - add $64, %edx -# endif - - test %eax, %eax - jz L(align64_loop) - -# ifndef USE_AS_RAWMEMCHR - sub $64, %edi -# else - sub $64, %edx -# endif - - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - -# ifndef USE_AS_RAWMEMCHR - movdqa 32(%edi), %xmm3 -# else - movdqa 32(%edx), %xmm3 -# endif - - pcmpeqb %xmm1, %xmm3 - -# ifndef USE_AS_RAWMEMCHR - pcmpeqb 48(%edi), %xmm1 -# else - pcmpeqb 48(%edx), %xmm1 -# endif - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - pmovmskb %xmm1, %eax - bsf %eax, %eax - -# ifndef USE_AS_RAWMEMCHR - lea 48(%edi, %eax), %eax - RETURN -# else - lea 48(%edx, %eax), %eax - ret -# endif - -# ifndef USE_AS_RAWMEMCHR - .p2align 4 -L(exit_loop): - add $64, %edx - cmp $32, %edx - jbe L(exit_loop_32) - - movdqa (%edi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches) - - movdqa 16(%edi), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - movdqa 32(%edi), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32_1) - cmp $48, %edx - jbe L(return_null) - - pcmpeqb 48(%edi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches48_1) - xor %eax, %eax - RETURN - - .p2align 4 -L(exit_loop_32): - movdqa (%edi), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches_1) - cmp $16, %edx - jbe L(return_null) - - pcmpeqb 16(%edi), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches16_1) - xor %eax, %eax - RETURN -# endif - .p2align 4 -L(matches0): - bsf %eax, %eax -# ifndef USE_AS_RAWMEMCHR - lea -16(%eax, %edi), %eax - RETURN -# else - lea -16(%eax, %edx), %eax - ret -# endif - - .p2align 4 -L(matches): - bsf %eax, %eax -# ifndef USE_AS_RAWMEMCHR - add %edi, %eax - RETURN -# else - add %edx, %eax - ret -# endif - - .p2align 4 -L(matches16): - bsf %eax, %eax -# ifndef USE_AS_RAWMEMCHR - lea 16(%eax, %edi), %eax - RETURN -# else - lea 16(%eax, %edx), %eax - ret -# endif - - .p2align 4 -L(matches32): - bsf %eax, %eax -# ifndef USE_AS_RAWMEMCHR - lea 32(%eax, %edi), %eax - RETURN -# else - lea 32(%eax, %edx), %eax - ret -# endif - -# ifndef USE_AS_RAWMEMCHR - .p2align 4 -L(matches_1): - bsf %eax, %eax - sub %eax, %edx - jbe L(return_null) - - add %edi, %eax - RETURN - - .p2align 4 -L(matches16_1): - sub $16, %edx - bsf %eax, %eax - sub %eax, %edx - jbe L(return_null) - - lea 16(%edi, %eax), %eax - RETURN - - .p2align 4 -L(matches32_1): - sub $32, %edx - bsf %eax, %eax - sub %eax, %edx - jbe L(return_null) - - lea 32(%edi, %eax), %eax - RETURN - - .p2align 4 -L(matches48_1): - sub $48, %edx - bsf %eax, %eax - sub %eax, %edx - jbe L(return_null) - - lea 48(%edi, %eax), %eax - RETURN -# endif - .p2align 4 -L(return_null): - xor %eax, %eax -# ifndef USE_AS_RAWMEMCHR - RETURN -# else - ret -# endif - -END (MEMCHR) -#endif diff --git a/sysdeps/i386/i686/multiarch/memchr-sse2.S b/sysdeps/i386/i686/multiarch/memchr-sse2.S deleted file mode 100644 index ae40677278..0000000000 --- a/sysdeps/i386/i686/multiarch/memchr-sse2.S +++ /dev/null @@ -1,705 +0,0 @@ -/* Optimized memchr with sse2 without bsf - Copyright (C) 2011-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#if IS_IN (libc) - -# include - -# define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -# define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) - -# ifndef USE_AS_RAWMEMCHR -# define ENTRANCE PUSH(%edi); -# define PARMS 8 -# define RETURN POP(%edi); ret; CFI_PUSH(%edi); -# else -# define ENTRANCE -# define PARMS 4 -# endif - -# define STR1 PARMS -# define STR2 STR1+4 - -# ifndef USE_AS_RAWMEMCHR -# define LEN STR2+4 -# endif - -# ifndef MEMCHR -# define MEMCHR __memchr_sse2 -# endif - - atom_text_section -ENTRY (MEMCHR) - ENTRANCE - mov STR1(%esp), %ecx - movd STR2(%esp), %xmm1 -# ifndef USE_AS_RAWMEMCHR - mov LEN(%esp), %edx - test %edx, %edx - jz L(return_null) -# endif - - punpcklbw %xmm1, %xmm1 -# ifndef USE_AS_RAWMEMCHR - mov %ecx, %edi -# else - mov %ecx, %edx -# endif - punpcklbw %xmm1, %xmm1 - - and $63, %ecx - pshufd $0, %xmm1, %xmm1 - cmp $48, %ecx - ja L(crosscache) - -# ifndef USE_AS_RAWMEMCHR - movdqu (%edi), %xmm0 -# else - movdqu (%edx), %xmm0 -# endif - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax -# ifndef USE_AS_RAWMEMCHR - jnz L(match_case2_prolog) - - sub $16, %edx - jbe L(return_null) - lea 16(%edi), %edi - and $15, %ecx - and $-16, %edi - add %ecx, %edx -# else - jnz L(match_case1_prolog) - lea 16(%edx), %edx - and $-16, %edx -# endif - jmp L(loop_prolog) - - .p2align 4 -L(crosscache): - and $15, %ecx -# ifndef USE_AS_RAWMEMCHR - and $-16, %edi - movdqa (%edi), %xmm0 -# else - and $-16, %edx - movdqa (%edx), %xmm0 -# endif - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - sar %cl, %eax - test %eax, %eax - -# ifndef USE_AS_RAWMEMCHR - jnz L(match_case2_prolog1) - lea -16(%edx), %edx - add %ecx, %edx - jle L(return_null) - lea 16(%edi), %edi -# else - jnz L(match_case1_prolog1) - lea 16(%edx), %edx -# endif - - .p2align 4 -L(loop_prolog): -# ifndef USE_AS_RAWMEMCHR - sub $64, %edx - jbe L(exit_loop) - movdqa (%edi), %xmm0 -# else - movdqa (%edx), %xmm0 -# endif - pcmpeqb %xmm1, %xmm0 - xor %ecx, %ecx - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(match_case1) - -# ifndef USE_AS_RAWMEMCHR - movdqa 16(%edi), %xmm2 -# else - movdqa 16(%edx), %xmm2 -# endif - pcmpeqb %xmm1, %xmm2 - lea 16(%ecx), %ecx - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(match_case1) - -# ifndef USE_AS_RAWMEMCHR - movdqa 32(%edi), %xmm3 -# else - movdqa 32(%edx), %xmm3 -# endif - pcmpeqb %xmm1, %xmm3 - lea 16(%ecx), %ecx - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(match_case1) - -# ifndef USE_AS_RAWMEMCHR - movdqa 48(%edi), %xmm4 -# else - movdqa 48(%edx), %xmm4 -# endif - pcmpeqb %xmm1, %xmm4 - lea 16(%ecx), %ecx - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(match_case1) - -# ifndef USE_AS_RAWMEMCHR - lea 64(%edi), %edi - sub $64, %edx - jbe L(exit_loop) - - movdqa (%edi), %xmm0 -# else - lea 64(%edx), %edx - movdqa (%edx), %xmm0 -# endif - pcmpeqb %xmm1, %xmm0 - xor %ecx, %ecx - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(match_case1) - -# ifndef USE_AS_RAWMEMCHR - movdqa 16(%edi), %xmm2 -# else - movdqa 16(%edx), %xmm2 -# endif - pcmpeqb %xmm1, %xmm2 - lea 16(%ecx), %ecx - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(match_case1) - -# ifndef USE_AS_RAWMEMCHR - movdqa 32(%edi), %xmm3 -# else - movdqa 32(%edx), %xmm3 -# endif - pcmpeqb %xmm1, %xmm3 - lea 16(%ecx), %ecx - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(match_case1) - -# ifndef USE_AS_RAWMEMCHR - movdqa 48(%edi), %xmm4 -# else - movdqa 48(%edx), %xmm4 -# endif - pcmpeqb %xmm1, %xmm4 - lea 16(%ecx), %ecx - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(match_case1) - -# ifndef USE_AS_RAWMEMCHR - lea 64(%edi), %edi - mov %edi, %ecx - and $-64, %edi - and $63, %ecx - add %ecx, %edx -# else - lea 64(%edx), %edx - and $-64, %edx -# endif - - .p2align 4 -L(align64_loop): - -# ifndef USE_AS_RAWMEMCHR - sub $64, %edx - jbe L(exit_loop) - movdqa (%edi), %xmm0 - movdqa 16(%edi), %xmm2 - movdqa 32(%edi), %xmm3 - movdqa 48(%edi), %xmm4 -# else - movdqa (%edx), %xmm0 - movdqa 16(%edx), %xmm2 - movdqa 32(%edx), %xmm3 - movdqa 48(%edx), %xmm4 -# endif - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm0, %xmm3 - pmaxub %xmm2, %xmm4 - pmaxub %xmm3, %xmm4 -# ifndef USE_AS_RAWMEMCHR - add $64, %edi -# else - add $64, %edx -# endif - pmovmskb %xmm4, %eax - - test %eax, %eax - jz L(align64_loop) - -# ifndef USE_AS_RAWMEMCHR - sub $64, %edi -# else - sub $64, %edx -# endif - - pmovmskb %xmm0, %eax - xor %ecx, %ecx - test %eax, %eax - jnz L(match_case1) - - pmovmskb %xmm2, %eax - lea 16(%ecx), %ecx - test %eax, %eax - jnz L(match_case1) - -# ifndef USE_AS_RAWMEMCHR - movdqa 32(%edi), %xmm3 -# else - movdqa 32(%edx), %xmm3 -# endif - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - lea 16(%ecx), %ecx - test %eax, %eax - jnz L(match_case1) - -# ifndef USE_AS_RAWMEMCHR - pcmpeqb 48(%edi), %xmm1 -# else - pcmpeqb 48(%edx), %xmm1 -# endif - pmovmskb %xmm1, %eax - lea 16(%ecx), %ecx - - .p2align 4 -L(match_case1): -# ifndef USE_AS_RAWMEMCHR - add %ecx, %edi -# else -L(match_case1_prolog1): - add %ecx, %edx -L(match_case1_prolog): -# endif - test %al, %al - jz L(match_case1_high) - mov %al, %cl - and $15, %cl - jz L(match_case1_8) - test $0x01, %al - jnz L(ExitCase1_1) - test $0x02, %al - jnz L(ExitCase1_2) - test $0x04, %al - jnz L(ExitCase1_3) -# ifndef USE_AS_RAWMEMCHR - lea 3(%edi), %eax - RETURN -# else - lea 3(%edx), %eax - ret -# endif - - .p2align 4 -L(match_case1_8): - test $0x10, %al - jnz L(ExitCase1_5) - test $0x20, %al - jnz L(ExitCase1_6) - test $0x40, %al - jnz L(ExitCase1_7) -# ifndef USE_AS_RAWMEMCHR - lea 7(%edi), %eax - RETURN -# else - lea 7(%edx), %eax - ret -# endif - - .p2align 4 -L(match_case1_high): - mov %ah, %ch - and $15, %ch - jz L(match_case1_high_8) - test $0x01, %ah - jnz L(ExitCase1_9) - test $0x02, %ah - jnz L(ExitCase1_10) - test $0x04, %ah - jnz L(ExitCase1_11) -# ifndef USE_AS_RAWMEMCHR - lea 11(%edi), %eax - RETURN -# else - lea 11(%edx), %eax - ret -# endif - - .p2align 4 -L(match_case1_high_8): - test $0x10, %ah - jnz L(ExitCase1_13) - test $0x20, %ah - jnz L(ExitCase1_14) - test $0x40, %ah - jnz L(ExitCase1_15) -# ifndef USE_AS_RAWMEMCHR - lea 15(%edi), %eax - RETURN -# else - lea 15(%edx), %eax - ret -# endif - -# ifndef USE_AS_RAWMEMCHR - .p2align 4 -L(exit_loop): - add $64, %edx - - movdqa (%edi), %xmm0 - pcmpeqb %xmm1, %xmm0 - xor %ecx, %ecx - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(match_case2) - cmp $16, %edx - jbe L(return_null) - - movdqa 16(%edi), %xmm2 - pcmpeqb %xmm1, %xmm2 - lea 16(%ecx), %ecx - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(match_case2) - cmp $32, %edx - jbe L(return_null) - - movdqa 32(%edi), %xmm3 - pcmpeqb %xmm1, %xmm3 - lea 16(%ecx), %ecx - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(match_case2) - cmp $48, %edx - jbe L(return_null) - - pcmpeqb 48(%edi), %xmm1 - lea 16(%ecx), %ecx - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(match_case2) - - xor %eax, %eax - RETURN -# endif - - .p2align 4 -L(ExitCase1_1): -# ifndef USE_AS_RAWMEMCHR - mov %edi, %eax - RETURN -# else - mov %edx, %eax - ret -# endif - - .p2align 4 -L(ExitCase1_2): -# ifndef USE_AS_RAWMEMCHR - lea 1(%edi), %eax - RETURN -# else - lea 1(%edx), %eax - ret -# endif - - .p2align 4 -L(ExitCase1_3): -# ifndef USE_AS_RAWMEMCHR - lea 2(%edi), %eax - RETURN -# else - lea 2(%edx), %eax - ret -# endif - - .p2align 4 -L(ExitCase1_5): -# ifndef USE_AS_RAWMEMCHR - lea 4(%edi), %eax - RETURN -# else - lea 4(%edx), %eax - ret -# endif - - .p2align 4 -L(ExitCase1_6): -# ifndef USE_AS_RAWMEMCHR - lea 5(%edi), %eax - RETURN -# else - lea 5(%edx), %eax - ret -# endif - - .p2align 4 -L(ExitCase1_7): -# ifndef USE_AS_RAWMEMCHR - lea 6(%edi), %eax - RETURN -# else - lea 6(%edx), %eax - ret -# endif - - .p2align 4 -L(ExitCase1_9): -# ifndef USE_AS_RAWMEMCHR - lea 8(%edi), %eax - RETURN -# else - lea 8(%edx), %eax - ret -# endif - - .p2align 4 -L(ExitCase1_10): -# ifndef USE_AS_RAWMEMCHR - lea 9(%edi), %eax - RETURN -# else - lea 9(%edx), %eax - ret -# endif - - .p2align 4 -L(ExitCase1_11): -# ifndef USE_AS_RAWMEMCHR - lea 10(%edi), %eax - RETURN -# else - lea 10(%edx), %eax - ret -# endif - - .p2align 4 -L(ExitCase1_13): -# ifndef USE_AS_RAWMEMCHR - lea 12(%edi), %eax - RETURN -# else - lea 12(%edx), %eax - ret -# endif - - .p2align 4 -L(ExitCase1_14): -# ifndef USE_AS_RAWMEMCHR - lea 13(%edi), %eax - RETURN -# else - lea 13(%edx), %eax - ret -# endif - - .p2align 4 -L(ExitCase1_15): -# ifndef USE_AS_RAWMEMCHR - lea 14(%edi), %eax - RETURN -# else - lea 14(%edx), %eax - ret -# endif - -# ifndef USE_AS_RAWMEMCHR - .p2align 4 -L(match_case2): - sub %ecx, %edx -L(match_case2_prolog1): - add %ecx, %edi -L(match_case2_prolog): - test %al, %al - jz L(match_case2_high) - mov %al, %cl - and $15, %cl - jz L(match_case2_8) - test $0x01, %al - jnz L(ExitCase2_1) - test $0x02, %al - jnz L(ExitCase2_2) - test $0x04, %al - jnz L(ExitCase2_3) - sub $4, %edx - jb L(return_null) - lea 3(%edi), %eax - RETURN - - .p2align 4 -L(match_case2_8): - test $0x10, %al - jnz L(ExitCase2_5) - test $0x20, %al - jnz L(ExitCase2_6) - test $0x40, %al - jnz L(ExitCase2_7) - sub $8, %edx - jb L(return_null) - lea 7(%edi), %eax - RETURN - - .p2align 4 -L(match_case2_high): - mov %ah, %ch - and $15, %ch - jz L(match_case2_high_8) - test $0x01, %ah - jnz L(ExitCase2_9) - test $0x02, %ah - jnz L(ExitCase2_10) - test $0x04, %ah - jnz L(ExitCase2_11) - sub $12, %edx - jb L(return_null) - lea 11(%edi), %eax - RETURN - - .p2align 4 -L(match_case2_high_8): - test $0x10, %ah - jnz L(ExitCase2_13) - test $0x20, %ah - jnz L(ExitCase2_14) - test $0x40, %ah - jnz L(ExitCase2_15) - sub $16, %edx - jb L(return_null) - lea 15(%edi), %eax - RETURN - - .p2align 4 -L(ExitCase2_1): - mov %edi, %eax - RETURN - - .p2align 4 -L(ExitCase2_2): - sub $2, %edx - jb L(return_null) - lea 1(%edi), %eax - RETURN - - .p2align 4 -L(ExitCase2_3): - sub $3, %edx - jb L(return_null) - lea 2(%edi), %eax - RETURN - - .p2align 4 -L(ExitCase2_5): - sub $5, %edx - jb L(return_null) - lea 4(%edi), %eax - RETURN - - .p2align 4 -L(ExitCase2_6): - sub $6, %edx - jb L(return_null) - lea 5(%edi), %eax - RETURN - - .p2align 4 -L(ExitCase2_7): - sub $7, %edx - jb L(return_null) - lea 6(%edi), %eax - RETURN - - .p2align 4 -L(ExitCase2_9): - sub $9, %edx - jb L(return_null) - lea 8(%edi), %eax - RETURN - - .p2align 4 -L(ExitCase2_10): - sub $10, %edx - jb L(return_null) - lea 9(%edi), %eax - RETURN - - .p2align 4 -L(ExitCase2_11): - sub $11, %edx - jb L(return_null) - lea 10(%edi), %eax - RETURN - - .p2align 4 -L(ExitCase2_13): - sub $13, %edx - jb L(return_null) - lea 12(%edi), %eax - RETURN - - .p2align 4 -L(ExitCase2_14): - sub $14, %edx - jb L(return_null) - lea 13(%edi), %eax - RETURN - - .p2align 4 -L(ExitCase2_15): - sub $15, %edx - jb L(return_null) - lea 14(%edi), %eax - RETURN -# endif - - .p2align 4 -L(return_null): - xor %eax, %eax -# ifndef USE_AS_RAWMEMCHR - RETURN -# else - ret -# endif - -END (MEMCHR) -#endif diff --git a/sysdeps/i386/i686/multiarch/memchr.S b/sysdeps/i386/i686/multiarch/memchr.S deleted file mode 100644 index 65e6b96f3b..0000000000 --- a/sysdeps/i386/i686/multiarch/memchr.S +++ /dev/null @@ -1,65 +0,0 @@ -/* Multiple versions of memchr - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2011-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include - -#if IS_IN (libc) - .text -ENTRY(__memchr) - .type __memchr, @gnu_indirect_function - LOAD_GOT_AND_RTLD_GLOBAL_RO - HAS_CPU_FEATURE (SSE2) - jz 2f - HAS_ARCH_FEATURE (Slow_BSF) - jz 3f - - LOAD_FUNC_GOT_EAX ( __memchr_sse2) - ret - -2: LOAD_FUNC_GOT_EAX (__memchr_ia32) - ret - -3: LOAD_FUNC_GOT_EAX (__memchr_sse2_bsf) - ret -END(__memchr) - -weak_alias(__memchr, memchr) - -# undef ENTRY -# define ENTRY(name) \ - .type __memchr_ia32, @function; \ - .globl __memchr_ia32; \ - .p2align 4; \ - __memchr_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __memchr_ia32, .-__memchr_ia32 - -# undef libc_hidden_builtin_def -/* IFUNC doesn't work with the hidden functions in shared library since - they will be called without setting up EBX needed for PLT which is - used by IFUNC. */ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memchr; __GI_memchr = __memchr_ia32 - -#endif -#include "../../memchr.S" diff --git a/sysdeps/i386/multiarch/Makefile b/sysdeps/i386/multiarch/Makefile index 58dffcd78e..937cea9414 100644 --- a/sysdeps/i386/multiarch/Makefile +++ b/sysdeps/i386/multiarch/Makefile @@ -17,5 +17,6 @@ sysdep_routines += bcopy-i386 bcopy-i686 bcopy-sse2-unaligned \ bzero-i386 bzero-i586 bzero-i686 \ bzero-sse2 bzero-sse2-rep \ memset-i386 memset-i586 memset-i686 \ - memset-sse2 memset-sse2-rep + memset-sse2 memset-sse2-rep \ + memchr-sse2-bsf memchr-sse2 endif diff --git a/sysdeps/i386/multiarch/ifunc-impl-list.c b/sysdeps/i386/multiarch/ifunc-impl-list.c index d5df5de426..420efb2874 100644 --- a/sysdeps/i386/multiarch/ifunc-impl-list.c +++ b/sysdeps/i386/multiarch/ifunc-impl-list.c @@ -63,15 +63,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, #endif ) -#if 0 /* Support sysdeps/i386/i686/multiarch/memchr.S. */ IFUNC_IMPL (i, name, memchr, IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2), __memchr_sse2_bsf) IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2), __memchr_sse2) - IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_ia32)) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_i386)) +#if 0 /* Support sysdeps/i386/i686/multiarch/memcmp.S. */ IFUNC_IMPL (i, name, memcmp, IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_2), diff --git a/sysdeps/i386/multiarch/memchr-sse2-bsf.S b/sysdeps/i386/multiarch/memchr-sse2-bsf.S new file mode 100644 index 0000000000..a5ae86a7ff --- /dev/null +++ b/sysdeps/i386/multiarch/memchr-sse2-bsf.S @@ -0,0 +1,496 @@ +/* Optimized memchr with sse2 + Copyright (C) 2011-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +# include + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + +# ifndef USE_AS_RAWMEMCHR +# define LEN STR2+4 +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); +# endif + +# ifndef MEMCHR +# define MEMCHR __memchr_sse2_bsf +# endif + + .text +ENTRY (MEMCHR) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + +# ifndef USE_AS_RAWMEMCHR + mov LEN(%esp), %edx + test %edx, %edx + jz L(return_null_1) +# endif + mov %ecx, %eax + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %ecx + ja L(crosscache) + + movdqu (%eax), %xmm0 + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %ecx + test %ecx, %ecx + je L(unaligned_no_match_1) +/* Check which byte is a match. */ + bsf %ecx, %ecx + +# ifndef USE_AS_RAWMEMCHR + sub %ecx, %edx + jbe L(return_null_1) +# endif + add %ecx, %eax + ret + + .p2align 4 +L(unaligned_no_match_1): +# ifndef USE_AS_RAWMEMCHR + sub $16, %edx + jbe L(return_null_1) + PUSH (%edi) + lea 16(%eax), %edi + and $15, %eax + and $-16, %edi + add %eax, %edx +# else + lea 16(%eax), %edx + and $-16, %edx +# endif + jmp L(loop_prolog) + + .p2align 4 +L(return_null_1): + xor %eax, %eax + ret + +# ifndef USE_AS_RAWMEMCHR + CFI_POP (%edi) +# endif + + .p2align 4 +L(crosscache): +/* Handle unaligned string. */ + +# ifndef USE_AS_RAWMEMCHR + PUSH (%edi) + mov %eax, %edi + and $15, %ecx + and $-16, %edi + movdqa (%edi), %xmm0 +# else + mov %eax, %edx + and $15, %ecx + and $-16, %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + sub %eax, %edx + jbe L(return_null) + add %edi, %eax + add %ecx, %eax + RETURN +# else + add %edx, %eax + add %ecx, %eax + ret +# endif + + .p2align 4 +L(unaligned_no_match): +# ifndef USE_AS_RAWMEMCHR + sub $16, %edx + add %ecx, %edx + jle L(return_null) + add $16, %edi +# else + add $16, %edx +# endif + + .p2align 4 +/* Loop start on aligned string. */ +L(loop_prolog): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + +# ifndef USE_AS_RAWMEMCHR + test $0x3f, %edi +# else + test $0x3f, %edx +# endif + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm3 +# else + movdqa 48(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + test %eax, %eax + jnz L(matches0) + +# ifndef USE_AS_RAWMEMCHR + mov %edi, %ecx + and $-64, %edi + and $63, %ecx + add %ecx, %edx +# else + and $-64, %edx +# endif + + .p2align 4 +L(align64_loop): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 + movdqa 16(%edi), %xmm2 + movdqa 32(%edi), %xmm3 + movdqa 48(%edi), %xmm4 +# else + movdqa (%edx), %xmm0 + movdqa 16(%edx), %xmm2 + movdqa 32(%edx), %xmm3 + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + + test %eax, %eax + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edi +# else + sub $64, %edx +# endif + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + + pcmpeqb %xmm1, %xmm3 + +# ifndef USE_AS_RAWMEMCHR + pcmpeqb 48(%edi), %xmm1 +# else + pcmpeqb 48(%edx), %xmm1 +# endif + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + lea 48(%edi, %eax), %eax + RETURN +# else + lea 48(%edx, %eax), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%edi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%edi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb 48(%edi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches48_1) + xor %eax, %eax + RETURN + + .p2align 4 +L(exit_loop_32): + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 16(%edi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches16_1) + xor %eax, %eax + RETURN +# endif + .p2align 4 +L(matches0): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea -16(%eax, %edi), %eax + RETURN +# else + lea -16(%eax, %edx), %eax + ret +# endif + + .p2align 4 +L(matches): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + add %edi, %eax + RETURN +# else + add %edx, %eax + ret +# endif + + .p2align 4 +L(matches16): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea 16(%eax, %edi), %eax + RETURN +# else + lea 16(%eax, %edx), %eax + ret +# endif + + .p2align 4 +L(matches32): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea 32(%eax, %edi), %eax + RETURN +# else + lea 32(%eax, %edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(matches_1): + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + add %edi, %eax + RETURN + + .p2align 4 +L(matches16_1): + sub $16, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 16(%edi, %eax), %eax + RETURN + + .p2align 4 +L(matches32_1): + sub $32, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 32(%edi, %eax), %eax + RETURN + + .p2align 4 +L(matches48_1): + sub $48, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 48(%edi, %eax), %eax + RETURN +# endif + .p2align 4 +L(return_null): + xor %eax, %eax +# ifndef USE_AS_RAWMEMCHR + RETURN +# else + ret +# endif + +END (MEMCHR) +#endif diff --git a/sysdeps/i386/multiarch/memchr-sse2.S b/sysdeps/i386/multiarch/memchr-sse2.S new file mode 100644 index 0000000000..ae40677278 --- /dev/null +++ b/sysdeps/i386/multiarch/memchr-sse2.S @@ -0,0 +1,705 @@ +/* Optimized memchr with sse2 without bsf + Copyright (C) 2011-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) + +# include + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef USE_AS_RAWMEMCHR +# define ENTRANCE PUSH(%edi); +# define PARMS 8 +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); +# else +# define ENTRANCE +# define PARMS 4 +# endif + +# define STR1 PARMS +# define STR2 STR1+4 + +# ifndef USE_AS_RAWMEMCHR +# define LEN STR2+4 +# endif + +# ifndef MEMCHR +# define MEMCHR __memchr_sse2 +# endif + + atom_text_section +ENTRY (MEMCHR) + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 +# ifndef USE_AS_RAWMEMCHR + mov LEN(%esp), %edx + test %edx, %edx + jz L(return_null) +# endif + + punpcklbw %xmm1, %xmm1 +# ifndef USE_AS_RAWMEMCHR + mov %ecx, %edi +# else + mov %ecx, %edx +# endif + punpcklbw %xmm1, %xmm1 + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + cmp $48, %ecx + ja L(crosscache) + +# ifndef USE_AS_RAWMEMCHR + movdqu (%edi), %xmm0 +# else + movdqu (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax +# ifndef USE_AS_RAWMEMCHR + jnz L(match_case2_prolog) + + sub $16, %edx + jbe L(return_null) + lea 16(%edi), %edi + and $15, %ecx + and $-16, %edi + add %ecx, %edx +# else + jnz L(match_case1_prolog) + lea 16(%edx), %edx + and $-16, %edx +# endif + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %ecx +# ifndef USE_AS_RAWMEMCHR + and $-16, %edi + movdqa (%edi), %xmm0 +# else + and $-16, %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + sar %cl, %eax + test %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + jnz L(match_case2_prolog1) + lea -16(%edx), %edx + add %ecx, %edx + jle L(return_null) + lea 16(%edi), %edi +# else + jnz L(match_case1_prolog1) + lea 16(%edx), %edx +# endif + + .p2align 4 +L(loop_prolog): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + lea 16(%ecx), %ecx + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + lea 64(%edi), %edi + sub $64, %edx + jbe L(exit_loop) + + movdqa (%edi), %xmm0 +# else + lea 64(%edx), %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + lea 16(%ecx), %ecx + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + lea 64(%edi), %edi + mov %edi, %ecx + and $-64, %edi + and $63, %ecx + add %ecx, %edx +# else + lea 64(%edx), %edx + and $-64, %edx +# endif + + .p2align 4 +L(align64_loop): + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 + movdqa 16(%edi), %xmm2 + movdqa 32(%edi), %xmm3 + movdqa 48(%edi), %xmm4 +# else + movdqa (%edx), %xmm0 + movdqa 16(%edx), %xmm2 + movdqa 32(%edx), %xmm3 + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + pmovmskb %xmm4, %eax + + test %eax, %eax + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edi +# else + sub $64, %edx +# endif + + pmovmskb %xmm0, %eax + xor %ecx, %ecx + test %eax, %eax + jnz L(match_case1) + + pmovmskb %xmm2, %eax + lea 16(%ecx), %ecx + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + lea 16(%ecx), %ecx + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + pcmpeqb 48(%edi), %xmm1 +# else + pcmpeqb 48(%edx), %xmm1 +# endif + pmovmskb %xmm1, %eax + lea 16(%ecx), %ecx + + .p2align 4 +L(match_case1): +# ifndef USE_AS_RAWMEMCHR + add %ecx, %edi +# else +L(match_case1_prolog1): + add %ecx, %edx +L(match_case1_prolog): +# endif + test %al, %al + jz L(match_case1_high) + mov %al, %cl + and $15, %cl + jz L(match_case1_8) + test $0x01, %al + jnz L(ExitCase1_1) + test $0x02, %al + jnz L(ExitCase1_2) + test $0x04, %al + jnz L(ExitCase1_3) +# ifndef USE_AS_RAWMEMCHR + lea 3(%edi), %eax + RETURN +# else + lea 3(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_8): + test $0x10, %al + jnz L(ExitCase1_5) + test $0x20, %al + jnz L(ExitCase1_6) + test $0x40, %al + jnz L(ExitCase1_7) +# ifndef USE_AS_RAWMEMCHR + lea 7(%edi), %eax + RETURN +# else + lea 7(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_high): + mov %ah, %ch + and $15, %ch + jz L(match_case1_high_8) + test $0x01, %ah + jnz L(ExitCase1_9) + test $0x02, %ah + jnz L(ExitCase1_10) + test $0x04, %ah + jnz L(ExitCase1_11) +# ifndef USE_AS_RAWMEMCHR + lea 11(%edi), %eax + RETURN +# else + lea 11(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_high_8): + test $0x10, %ah + jnz L(ExitCase1_13) + test $0x20, %ah + jnz L(ExitCase1_14) + test $0x40, %ah + jnz L(ExitCase1_15) +# ifndef USE_AS_RAWMEMCHR + lea 15(%edi), %eax + RETURN +# else + lea 15(%edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(exit_loop): + add $64, %edx + + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case2) + cmp $16, %edx + jbe L(return_null) + + movdqa 16(%edi), %xmm2 + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case2) + cmp $32, %edx + jbe L(return_null) + + movdqa 32(%edi), %xmm3 + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case2) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb 48(%edi), %xmm1 + lea 16(%ecx), %ecx + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(match_case2) + + xor %eax, %eax + RETURN +# endif + + .p2align 4 +L(ExitCase1_1): +# ifndef USE_AS_RAWMEMCHR + mov %edi, %eax + RETURN +# else + mov %edx, %eax + ret +# endif + + .p2align 4 +L(ExitCase1_2): +# ifndef USE_AS_RAWMEMCHR + lea 1(%edi), %eax + RETURN +# else + lea 1(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_3): +# ifndef USE_AS_RAWMEMCHR + lea 2(%edi), %eax + RETURN +# else + lea 2(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_5): +# ifndef USE_AS_RAWMEMCHR + lea 4(%edi), %eax + RETURN +# else + lea 4(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_6): +# ifndef USE_AS_RAWMEMCHR + lea 5(%edi), %eax + RETURN +# else + lea 5(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_7): +# ifndef USE_AS_RAWMEMCHR + lea 6(%edi), %eax + RETURN +# else + lea 6(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_9): +# ifndef USE_AS_RAWMEMCHR + lea 8(%edi), %eax + RETURN +# else + lea 8(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_10): +# ifndef USE_AS_RAWMEMCHR + lea 9(%edi), %eax + RETURN +# else + lea 9(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_11): +# ifndef USE_AS_RAWMEMCHR + lea 10(%edi), %eax + RETURN +# else + lea 10(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_13): +# ifndef USE_AS_RAWMEMCHR + lea 12(%edi), %eax + RETURN +# else + lea 12(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_14): +# ifndef USE_AS_RAWMEMCHR + lea 13(%edi), %eax + RETURN +# else + lea 13(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_15): +# ifndef USE_AS_RAWMEMCHR + lea 14(%edi), %eax + RETURN +# else + lea 14(%edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(match_case2): + sub %ecx, %edx +L(match_case2_prolog1): + add %ecx, %edi +L(match_case2_prolog): + test %al, %al + jz L(match_case2_high) + mov %al, %cl + and $15, %cl + jz L(match_case2_8) + test $0x01, %al + jnz L(ExitCase2_1) + test $0x02, %al + jnz L(ExitCase2_2) + test $0x04, %al + jnz L(ExitCase2_3) + sub $4, %edx + jb L(return_null) + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_8): + test $0x10, %al + jnz L(ExitCase2_5) + test $0x20, %al + jnz L(ExitCase2_6) + test $0x40, %al + jnz L(ExitCase2_7) + sub $8, %edx + jb L(return_null) + lea 7(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_high): + mov %ah, %ch + and $15, %ch + jz L(match_case2_high_8) + test $0x01, %ah + jnz L(ExitCase2_9) + test $0x02, %ah + jnz L(ExitCase2_10) + test $0x04, %ah + jnz L(ExitCase2_11) + sub $12, %edx + jb L(return_null) + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_high_8): + test $0x10, %ah + jnz L(ExitCase2_13) + test $0x20, %ah + jnz L(ExitCase2_14) + test $0x40, %ah + jnz L(ExitCase2_15) + sub $16, %edx + jb L(return_null) + lea 15(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_1): + mov %edi, %eax + RETURN + + .p2align 4 +L(ExitCase2_2): + sub $2, %edx + jb L(return_null) + lea 1(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_3): + sub $3, %edx + jb L(return_null) + lea 2(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_5): + sub $5, %edx + jb L(return_null) + lea 4(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_6): + sub $6, %edx + jb L(return_null) + lea 5(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_7): + sub $7, %edx + jb L(return_null) + lea 6(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_9): + sub $9, %edx + jb L(return_null) + lea 8(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_10): + sub $10, %edx + jb L(return_null) + lea 9(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_11): + sub $11, %edx + jb L(return_null) + lea 10(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_13): + sub $13, %edx + jb L(return_null) + lea 12(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_14): + sub $14, %edx + jb L(return_null) + lea 13(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_15): + sub $15, %edx + jb L(return_null) + lea 14(%edi), %eax + RETURN +# endif + + .p2align 4 +L(return_null): + xor %eax, %eax +# ifndef USE_AS_RAWMEMCHR + RETURN +# else + ret +# endif + +END (MEMCHR) +#endif diff --git a/sysdeps/i386/multiarch/memchr.S b/sysdeps/i386/multiarch/memchr.S new file mode 100644 index 0000000000..4b2b941418 --- /dev/null +++ b/sysdeps/i386/multiarch/memchr.S @@ -0,0 +1,65 @@ +/* Multiple versions of memchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) +# include +# include + + .text +ENTRY(__memchr) + .type __memchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + HAS_CPU_FEATURE (SSE2) + jz 2f + HAS_ARCH_FEATURE (Slow_BSF) + jz 3f + + LOAD_FUNC_GOT_EAX ( __memchr_sse2) + ret + +2: LOAD_FUNC_GOT_EAX (__memchr_i386) + ret + +3: LOAD_FUNC_GOT_EAX (__memchr_sse2_bsf) + ret +END(__memchr) + +weak_alias(__memchr, memchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __memchr_i386, @function; \ + .globl __memchr_i386; \ + .p2align 4; \ + __memchr_i386: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memchr_i386, .-__memchr_i386 + +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memchr; __GI_memchr = __memchr_i386 + +#endif +#include -- cgit 1.4.1