From 3882e4ba5b6a043a034b20b6001dda727cc45a80 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 20 Aug 2015 08:28:10 -0700 Subject: Add i386 memrchr multiarch functions --- sysdeps/i386/i686/multiarch/Makefile | 1 - sysdeps/i386/i686/multiarch/memrchr-c.c | 7 - sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S | 417 -------------- sysdeps/i386/i686/multiarch/memrchr-sse2.S | 724 ------------------------- sysdeps/i386/i686/multiarch/memrchr.S | 45 -- sysdeps/i386/multiarch/Makefile | 3 +- sysdeps/i386/multiarch/ifunc-impl-list.c | 4 +- sysdeps/i386/multiarch/memrchr-i386.c | 7 + sysdeps/i386/multiarch/memrchr-sse2-bsf.S | 417 ++++++++++++++ sysdeps/i386/multiarch/memrchr-sse2.S | 724 +++++++++++++++++++++++++ sysdeps/i386/multiarch/memrchr.S | 45 ++ 11 files changed, 1196 insertions(+), 1198 deletions(-) delete mode 100644 sysdeps/i386/i686/multiarch/memrchr-c.c delete mode 100644 sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S delete mode 100644 sysdeps/i386/i686/multiarch/memrchr-sse2.S delete mode 100644 sysdeps/i386/i686/multiarch/memrchr.S create mode 100644 sysdeps/i386/multiarch/memrchr-i386.c create mode 100644 sysdeps/i386/multiarch/memrchr-sse2-bsf.S create mode 100644 sysdeps/i386/multiarch/memrchr-sse2.S create mode 100644 sysdeps/i386/multiarch/memrchr.S diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index ec128d7047..0b91d00a0d 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -7,7 +7,6 @@ sysdep_routines += strcmp-ssse3 \ strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \ strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \ strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \ - memrchr-sse2 memrchr-sse2-bsf memrchr-c \ rawmemchr-sse2 rawmemchr-sse2-bsf \ strnlen-sse2 strnlen-c \ strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \ diff --git a/sysdeps/i386/i686/multiarch/memrchr-c.c b/sysdeps/i386/i686/multiarch/memrchr-c.c deleted file mode 100644 index 
ef7bbbe792..0000000000 --- a/sysdeps/i386/i686/multiarch/memrchr-c.c +++ /dev/null @@ -1,7 +0,0 @@ -#if IS_IN (libc) -# define MEMRCHR __memrchr_ia32 -# include <string.h> -extern void *__memrchr_ia32 (const void *, int, size_t); -#endif - -#include "string/memrchr.c" diff --git a/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S deleted file mode 100644 index 043e1bbd23..0000000000 --- a/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S +++ /dev/null @@ -1,417 +0,0 @@ -/* Optimized memrchr with sse2 - Copyright (C) 2011-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#if IS_IN (libc) - -# include <sysdep.h> - -# define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -# define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) - -# define PARMS 4 -# define STR1 PARMS -# define STR2 STR1+4 -# define LEN STR2+4 - -# define MEMCHR __memrchr_sse2_bsf - - .text -ENTRY (MEMCHR) - mov STR1(%esp), %ecx - movd STR2(%esp), %xmm1 - mov LEN(%esp), %edx - - sub $16, %edx - jbe L(length_less16) - - punpcklbw %xmm1, %xmm1 - add %edx, %ecx - punpcklbw %xmm1, %xmm1 - - movdqu (%ecx), %xmm0 - pshufd $0, %xmm1, %xmm1 - pcmpeqb %xmm1, %xmm0 - -/* Check if there is a match. */ - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %ecx - mov %ecx, %eax - and $15, %eax - jz L(loop_prolog) - - add $16, %ecx - add $16, %edx - sub %eax, %ecx - sub %eax, %edx - - .p2align 4 -/* Loop start on aligned string. */ -L(loop_prolog): - sub $64, %edx - jbe L(exit_loop) - - movdqa 48(%ecx), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%ecx), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%ecx), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%ecx), %xmm4 - pcmpeqb %xmm1, %xmm4 - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches0) - - sub $64, %ecx - sub $64, %edx - jbe L(exit_loop) - - movdqa 48(%ecx), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%ecx), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%ecx), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%ecx), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches0) - - mov %ecx, %eax - and $63, %eax - test 
%eax, %eax - jz L(align64_loop) - - add $64, %ecx - add $64, %edx - sub %eax, %ecx - sub %eax, %edx - - .p2align 4 -L(align64_loop): - sub $64, %ecx - sub $64, %edx - jbe L(exit_loop) - - movdqa (%ecx), %xmm0 - movdqa 16(%ecx), %xmm2 - movdqa 32(%ecx), %xmm3 - movdqa 48(%ecx), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm3, %xmm0 - pmaxub %xmm4, %xmm2 - pmaxub %xmm0, %xmm2 - pmovmskb %xmm2, %eax - - test %eax, %eax - jz L(align64_loop) - - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches48) - - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%ecx), %xmm2 - - pcmpeqb %xmm1, %xmm2 - pcmpeqb (%ecx), %xmm1 - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - pmovmskb %xmm1, %eax - bsr %eax, %eax - - add %ecx, %eax - ret - - .p2align 4 -L(exit_loop): - add $64, %edx - cmp $32, %edx - jbe L(exit_loop_32) - - movdqa 48(%ecx), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%ecx), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%ecx), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16_1) - cmp $48, %edx - jbe L(return_null) - - pcmpeqb (%ecx), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches0_1) - xor %eax, %eax - ret - - .p2align 4 -L(exit_loop_32): - movdqa 48(%ecx), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48_1) - cmp $16, %edx - jbe L(return_null) - - pcmpeqb 32(%ecx), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches32_1) - xor %eax, %eax - ret - - .p2align 4 -L(matches0): - bsr %eax, %eax - add %ecx, %eax - ret - - .p2align 4 -L(matches16): - bsr %eax, %eax - lea 16(%eax, %ecx), %eax - ret - - .p2align 4 -L(matches32): - bsr %eax, %eax - lea 32(%eax, %ecx), %eax - ret - - .p2align 4 -L(matches48): - bsr %eax, %eax - lea 48(%eax, %ecx), %eax - 
ret - - .p2align 4 -L(matches0_1): - bsr %eax, %eax - sub $64, %edx - add %eax, %edx - jl L(return_null) - add %ecx, %eax - ret - - .p2align 4 -L(matches16_1): - bsr %eax, %eax - sub $48, %edx - add %eax, %edx - jl L(return_null) - lea 16(%ecx, %eax), %eax - ret - - .p2align 4 -L(matches32_1): - bsr %eax, %eax - sub $32, %edx - add %eax, %edx - jl L(return_null) - lea 32(%ecx, %eax), %eax - ret - - .p2align 4 -L(matches48_1): - bsr %eax, %eax - sub $16, %edx - add %eax, %edx - jl L(return_null) - lea 48(%ecx, %eax), %eax - ret - - .p2align 4 -L(return_null): - xor %eax, %eax - ret - - .p2align 4 -L(length_less16_offset0): - mov %dl, %cl - pcmpeqb (%eax), %xmm1 - - mov $1, %edx - sal %cl, %edx - sub $1, %edx - mov %edx, %ecx - - pmovmskb %xmm1, %edx - - and %ecx, %edx - test %edx, %edx - jz L(return_null) - - bsr %edx, %ecx - add %ecx, %eax - ret - - .p2align 4 -L(length_less16): - punpcklbw %xmm1, %xmm1 - mov %ecx, %eax - punpcklbw %xmm1, %xmm1 - add $16, %edx - jz L(return_null) - - pshufd $0, %xmm1, %xmm1 - and $15, %ecx - jz L(length_less16_offset0) - - PUSH (%edi) - mov %cl, %dh - add %dl, %dh - and $-16, %eax - - sub $16, %dh - ja L(length_less16_part2) - - pcmpeqb (%eax), %xmm1 - pmovmskb %xmm1, %edi - - sar %cl, %edi - add %ecx, %eax - mov %dl, %cl - - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %edi - test %edi, %edi - jz L(ret_null) - - bsr %edi, %edi - add %edi, %eax - POP (%edi) - ret - - CFI_PUSH (%edi) - - .p2align 4 -L(length_less16_part2): - movdqa 16(%eax), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %edi - - mov %cl, %ch - - mov %dh, %cl - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %edi - - test %edi, %edi - jnz L(length_less16_part2_return) - - pcmpeqb (%eax), %xmm1 - pmovmskb %xmm1, %edi - - mov %ch, %cl - sar %cl, %edi - test %edi, %edi - jz L(ret_null) - - bsr %edi, %edi - add %edi, %eax - xor %ch, %ch - add %ecx, %eax - POP (%edi) - ret - - CFI_PUSH (%edi) - - .p2align 4 -L(length_less16_part2_return): - bsr 
%edi, %edi - lea 16(%eax, %edi), %eax - POP (%edi) - ret - - CFI_PUSH (%edi) - - .p2align 4 -L(ret_null): - xor %eax, %eax - POP (%edi) - ret - -END (MEMCHR) -#endif diff --git a/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/sysdeps/i386/i686/multiarch/memrchr-sse2.S deleted file mode 100644 index 65e3c8bc73..0000000000 --- a/sysdeps/i386/i686/multiarch/memrchr-sse2.S +++ /dev/null @@ -1,724 +0,0 @@ -/* Optimized memrchr with sse2 without bsf - Copyright (C) 2011-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#if IS_IN (libc) - -# include <sysdep.h> -# define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -# define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) - -# define PARMS 4 -# define STR1 PARMS -# define STR2 STR1+4 -# define LEN STR2+4 - - atom_text_section -ENTRY (__memrchr_sse2) - mov STR1(%esp), %ecx - movd STR2(%esp), %xmm1 - mov LEN(%esp), %edx - - sub $16, %edx - jbe L(length_less16) - - punpcklbw %xmm1, %xmm1 - add %edx, %ecx - punpcklbw %xmm1, %xmm1 - - movdqu (%ecx), %xmm0 - pshufd $0, %xmm1, %xmm1 - pcmpeqb %xmm1, %xmm0 - - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(exit_dispatch) - - sub $64, %ecx - mov %ecx, %eax - and $15, %eax - jz L(loop_prolog) - - lea 16(%ecx), %ecx - lea 16(%edx), %edx - sub %eax, %edx - and $-16, %ecx - - .p2align 4 -/* Loop start on aligned string. */ -L(loop_prolog): - sub $64, %edx - jbe L(exit_loop) - - movdqa 48(%ecx), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%ecx), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%ecx), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%ecx), %xmm4 - pcmpeqb %xmm1, %xmm4 - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(exit_dispatch) - - sub $64, %ecx - sub $64, %edx - jbe L(exit_loop) - - movdqa 48(%ecx), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%ecx), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%ecx), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16) - - movdqa (%ecx), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(exit_dispatch) - - mov %ecx, %eax - and $63, %eax - test %eax, %eax - jz 
L(align64_loop) - - lea 64(%ecx), %ecx - lea 64(%edx), %edx - and $-64, %ecx - sub %eax, %edx - - .p2align 4 -L(align64_loop): - sub $64, %ecx - sub $64, %edx - jbe L(exit_loop) - - movdqa (%ecx), %xmm0 - movdqa 16(%ecx), %xmm2 - movdqa 32(%ecx), %xmm3 - movdqa 48(%ecx), %xmm4 - - pcmpeqb %xmm1, %xmm0 - pcmpeqb %xmm1, %xmm2 - pcmpeqb %xmm1, %xmm3 - pcmpeqb %xmm1, %xmm4 - - pmaxub %xmm3, %xmm0 - pmaxub %xmm4, %xmm2 - pmaxub %xmm0, %xmm2 - pmovmskb %xmm2, %eax - - test %eax, %eax - jz L(align64_loop) - - pmovmskb %xmm4, %eax - test %eax, %eax - jnz L(matches48) - - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%ecx), %xmm2 - - pcmpeqb %xmm1, %xmm2 - pcmpeqb (%ecx), %xmm1 - - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches16) - - pmovmskb %xmm1, %eax - test %ah, %ah - jnz L(exit_dispatch_high) - mov %al, %dl - and $15 << 4, %dl - jnz L(exit_dispatch_8) - test $0x08, %al - jnz L(exit_4) - test $0x04, %al - jnz L(exit_3) - test $0x02, %al - jnz L(exit_2) - mov %ecx, %eax - ret - - .p2align 4 -L(exit_loop): - add $64, %edx - cmp $32, %edx - jbe L(exit_loop_32) - - movdqa 48(%ecx), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48) - - movdqa 32(%ecx), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %eax - test %eax, %eax - jnz L(matches32) - - movdqa 16(%ecx), %xmm3 - pcmpeqb %xmm1, %xmm3 - pmovmskb %xmm3, %eax - test %eax, %eax - jnz L(matches16_1) - cmp $48, %edx - jbe L(return_null) - - pcmpeqb (%ecx), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches0_1) - xor %eax, %eax - ret - - .p2align 4 -L(exit_loop_32): - movdqa 48(%ecx), %xmm0 - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz L(matches48_1) - cmp $16, %edx - jbe L(return_null) - - pcmpeqb 32(%ecx), %xmm1 - pmovmskb %xmm1, %eax - test %eax, %eax - jnz L(matches32_1) - xor %eax, %eax - ret - - .p2align 4 -L(matches16): - lea 16(%ecx), %ecx - test %ah, %ah - jnz L(exit_dispatch_high) - mov %al, %dl - and $15 << 
4, %dl - jnz L(exit_dispatch_8) - test $0x08, %al - jnz L(exit_4) - test $0x04, %al - jnz L(exit_3) - test $0x02, %al - jnz L(exit_2) - mov %ecx, %eax - ret - - .p2align 4 -L(matches32): - lea 32(%ecx), %ecx - test %ah, %ah - jnz L(exit_dispatch_high) - mov %al, %dl - and $15 << 4, %dl - jnz L(exit_dispatch_8) - test $0x08, %al - jnz L(exit_4) - test $0x04, %al - jnz L(exit_3) - test $0x02, %al - jnz L(exit_2) - mov %ecx, %eax - ret - - .p2align 4 -L(matches48): - lea 48(%ecx), %ecx - - .p2align 4 -L(exit_dispatch): - test %ah, %ah - jnz L(exit_dispatch_high) - mov %al, %dl - and $15 << 4, %dl - jnz L(exit_dispatch_8) - test $0x08, %al - jnz L(exit_4) - test $0x04, %al - jnz L(exit_3) - test $0x02, %al - jnz L(exit_2) - mov %ecx, %eax - ret - - .p2align 4 -L(exit_dispatch_8): - test $0x80, %al - jnz L(exit_8) - test $0x40, %al - jnz L(exit_7) - test $0x20, %al - jnz L(exit_6) - lea 4(%ecx), %eax - ret - - .p2align 4 -L(exit_dispatch_high): - mov %ah, %dh - and $15 << 4, %dh - jnz L(exit_dispatch_high_8) - test $0x08, %ah - jnz L(exit_12) - test $0x04, %ah - jnz L(exit_11) - test $0x02, %ah - jnz L(exit_10) - lea 8(%ecx), %eax - ret - - .p2align 4 -L(exit_dispatch_high_8): - test $0x80, %ah - jnz L(exit_16) - test $0x40, %ah - jnz L(exit_15) - test $0x20, %ah - jnz L(exit_14) - lea 12(%ecx), %eax - ret - - .p2align 4 -L(exit_2): - lea 1(%ecx), %eax - ret - - .p2align 4 -L(exit_3): - lea 2(%ecx), %eax - ret - - .p2align 4 -L(exit_4): - lea 3(%ecx), %eax - ret - - .p2align 4 -L(exit_6): - lea 5(%ecx), %eax - ret - - .p2align 4 -L(exit_7): - lea 6(%ecx), %eax - ret - - .p2align 4 -L(exit_8): - lea 7(%ecx), %eax - ret - - .p2align 4 -L(exit_10): - lea 9(%ecx), %eax - ret - - .p2align 4 -L(exit_11): - lea 10(%ecx), %eax - ret - - .p2align 4 -L(exit_12): - lea 11(%ecx), %eax - ret - - .p2align 4 -L(exit_14): - lea 13(%ecx), %eax - ret - - .p2align 4 -L(exit_15): - lea 14(%ecx), %eax - ret - - .p2align 4 -L(exit_16): - lea 15(%ecx), %eax - ret - - .p2align 4 
-L(matches0_1): - lea -64(%edx), %edx - - test %ah, %ah - jnz L(exit_dispatch_1_high) - mov %al, %ah - and $15 << 4, %ah - jnz L(exit_dispatch_1_8) - test $0x08, %al - jnz L(exit_1_4) - test $0x04, %al - jnz L(exit_1_3) - test $0x02, %al - jnz L(exit_1_2) - add $0, %edx - jl L(return_null) - mov %ecx, %eax - ret - - .p2align 4 -L(matches16_1): - lea -48(%edx), %edx - lea 16(%ecx), %ecx - - test %ah, %ah - jnz L(exit_dispatch_1_high) - mov %al, %ah - and $15 << 4, %ah - jnz L(exit_dispatch_1_8) - test $0x08, %al - jnz L(exit_1_4) - test $0x04, %al - jnz L(exit_1_3) - test $0x02, %al - jnz L(exit_1_2) - add $0, %edx - jl L(return_null) - mov %ecx, %eax - ret - - .p2align 4 -L(matches32_1): - lea -32(%edx), %edx - lea 32(%ecx), %ecx - - test %ah, %ah - jnz L(exit_dispatch_1_high) - mov %al, %ah - and $15 << 4, %ah - jnz L(exit_dispatch_1_8) - test $0x08, %al - jnz L(exit_1_4) - test $0x04, %al - jnz L(exit_1_3) - test $0x02, %al - jnz L(exit_1_2) - add $0, %edx - jl L(return_null) - mov %ecx, %eax - ret - - .p2align 4 -L(matches48_1): - lea -16(%edx), %edx - lea 48(%ecx), %ecx - - .p2align 4 -L(exit_dispatch_1): - test %ah, %ah - jnz L(exit_dispatch_1_high) - mov %al, %ah - and $15 << 4, %ah - jnz L(exit_dispatch_1_8) - test $0x08, %al - jnz L(exit_1_4) - test $0x04, %al - jnz L(exit_1_3) - test $0x02, %al - jnz L(exit_1_2) - add $0, %edx - jl L(return_null) - mov %ecx, %eax - ret - - .p2align 4 -L(exit_dispatch_1_8): - test $0x80, %al - jnz L(exit_1_8) - test $0x40, %al - jnz L(exit_1_7) - test $0x20, %al - jnz L(exit_1_6) - add $4, %edx - jl L(return_null) - lea 4(%ecx), %eax - ret - - .p2align 4 -L(exit_dispatch_1_high): - mov %ah, %al - and $15 << 4, %al - jnz L(exit_dispatch_1_high_8) - test $0x08, %ah - jnz L(exit_1_12) - test $0x04, %ah - jnz L(exit_1_11) - test $0x02, %ah - jnz L(exit_1_10) - add $8, %edx - jl L(return_null) - lea 8(%ecx), %eax - ret - - .p2align 4 -L(exit_dispatch_1_high_8): - test $0x80, %ah - jnz L(exit_1_16) - test $0x40, %ah - jnz 
L(exit_1_15) - test $0x20, %ah - jnz L(exit_1_14) - add $12, %edx - jl L(return_null) - lea 12(%ecx), %eax - ret - - .p2align 4 -L(exit_1_2): - add $1, %edx - jl L(return_null) - lea 1(%ecx), %eax - ret - - .p2align 4 -L(exit_1_3): - add $2, %edx - jl L(return_null) - lea 2(%ecx), %eax - ret - - .p2align 4 -L(exit_1_4): - add $3, %edx - jl L(return_null) - lea 3(%ecx), %eax - ret - - .p2align 4 -L(exit_1_6): - add $5, %edx - jl L(return_null) - lea 5(%ecx), %eax - ret - - .p2align 4 -L(exit_1_7): - add $6, %edx - jl L(return_null) - lea 6(%ecx), %eax - ret - - .p2align 4 -L(exit_1_8): - add $7, %edx - jl L(return_null) - lea 7(%ecx), %eax - ret - - .p2align 4 -L(exit_1_10): - add $9, %edx - jl L(return_null) - lea 9(%ecx), %eax - ret - - .p2align 4 -L(exit_1_11): - add $10, %edx - jl L(return_null) - lea 10(%ecx), %eax - ret - - .p2align 4 -L(exit_1_12): - add $11, %edx - jl L(return_null) - lea 11(%ecx), %eax - ret - - .p2align 4 -L(exit_1_14): - add $13, %edx - jl L(return_null) - lea 13(%ecx), %eax - ret - - .p2align 4 -L(exit_1_15): - add $14, %edx - jl L(return_null) - lea 14(%ecx), %eax - ret - - .p2align 4 -L(exit_1_16): - add $15, %edx - jl L(return_null) - lea 15(%ecx), %eax - ret - - .p2align 4 -L(return_null): - xor %eax, %eax - ret - - .p2align 4 -L(length_less16_offset0): - mov %dl, %cl - pcmpeqb (%eax), %xmm1 - - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - mov %eax, %ecx - pmovmskb %xmm1, %eax - - and %edx, %eax - test %eax, %eax - jnz L(exit_dispatch) - - xor %eax, %eax - ret - - .p2align 4 -L(length_less16): - punpcklbw %xmm1, %xmm1 - add $16, %edx - je L(return_null) - punpcklbw %xmm1, %xmm1 - - mov %ecx, %eax - pshufd $0, %xmm1, %xmm1 - - and $15, %ecx - jz L(length_less16_offset0) - - PUSH (%edi) - - mov %cl, %dh - add %dl, %dh - and $-16, %eax - - sub $16, %dh - ja L(length_less16_part2) - - pcmpeqb (%eax), %xmm1 - pmovmskb %xmm1, %edi - - sar %cl, %edi - add %ecx, %eax - mov %dl, %cl - - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and 
%edx, %edi - test %edi, %edi - jz L(ret_null) - - bsr %edi, %edi - add %edi, %eax - POP (%edi) - ret - - CFI_PUSH (%edi) - - .p2align 4 -L(length_less16_part2): - movdqa 16(%eax), %xmm2 - pcmpeqb %xmm1, %xmm2 - pmovmskb %xmm2, %edi - - mov %cl, %ch - - mov %dh, %cl - mov $1, %edx - sal %cl, %edx - sub $1, %edx - - and %edx, %edi - - test %edi, %edi - jnz L(length_less16_part2_return) - - pcmpeqb (%eax), %xmm1 - pmovmskb %xmm1, %edi - - mov %ch, %cl - sar %cl, %edi - test %edi, %edi - jz L(ret_null) - - bsr %edi, %edi - add %edi, %eax - xor %ch, %ch - add %ecx, %eax - POP (%edi) - ret - - CFI_PUSH (%edi) - - .p2align 4 -L(length_less16_part2_return): - bsr %edi, %edi - lea 16(%eax, %edi), %eax - POP (%edi) - ret - - CFI_PUSH (%edi) - - .p2align 4 -L(ret_null): - xor %eax, %eax - POP (%edi) - ret - -END (__memrchr_sse2) -#endif diff --git a/sysdeps/i386/i686/multiarch/memrchr.S b/sysdeps/i386/i686/multiarch/memrchr.S deleted file mode 100644 index 32fb1a6792..0000000000 --- a/sysdeps/i386/i686/multiarch/memrchr.S +++ /dev/null @@ -1,45 +0,0 @@ -/* Multiple versions of memrchr - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2011-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> -#include <init-arch.h> - -#if IS_IN (libc) - .text -ENTRY(__memrchr) - .type __memrchr, @gnu_indirect_function - LOAD_GOT_AND_RTLD_GLOBAL_RO - HAS_CPU_FEATURE (SSE2) - jz 2f - HAS_ARCH_FEATURE (Slow_BSF) - jz 3f - - LOAD_FUNC_GOT_EAX (__memrchr_sse2) - ret - -2: LOAD_FUNC_GOT_EAX (__memrchr_ia32) - ret - -3: LOAD_FUNC_GOT_EAX (__memrchr_sse2_bsf) - ret -END(__memrchr) - -weak_alias(__memrchr, memrchr) -#endif diff --git a/sysdeps/i386/multiarch/Makefile b/sysdeps/i386/multiarch/Makefile index 577bbb6c65..fc9364fe65 100644 --- a/sysdeps/i386/multiarch/Makefile +++ b/sysdeps/i386/multiarch/Makefile @@ -19,5 +19,6 @@ sysdep_routines += bcopy-i386 bcopy-i686 bcopy-sse2-unaligned \ memset-i386 memset-i586 memset-i686 \ memset-sse2 memset-sse2-rep \ memchr-sse2-bsf memchr-sse2 \ - memcmp-i386 memcmp-i686 memcmp-ssse3 memcmp-sse4 + memcmp-i386 memcmp-i686 memcmp-ssse3 memcmp-sse4 \ + memrchr-i386 memrchr-sse2-bsf memrchr-sse2 endif diff --git a/sysdeps/i386/multiarch/ifunc-impl-list.c b/sysdeps/i386/multiarch/ifunc-impl-list.c index e57134d340..15c2009e62 100644 --- a/sysdeps/i386/multiarch/ifunc-impl-list.c +++ b/sysdeps/i386/multiarch/ifunc-impl-list.c @@ -116,15 +116,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, #endif ) -#if 0 /* Support sysdeps/i386/i686/multiarch/memrchr.S. */ IFUNC_IMPL (i, name, memrchr, IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2), __memrchr_sse2_bsf) IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2), __memrchr_sse2) - IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_ia32)) -#endif + IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_i386)) /* Support sysdeps/i386/i686/multiarch/memset_chk.S. 
*/ IFUNC_IMPL (i, name, __memset_chk, diff --git a/sysdeps/i386/multiarch/memrchr-i386.c b/sysdeps/i386/multiarch/memrchr-i386.c new file mode 100644 index 0000000000..f8cef0712b --- /dev/null +++ b/sysdeps/i386/multiarch/memrchr-i386.c @@ -0,0 +1,7 @@ +#if IS_IN (libc) +# define MEMRCHR __memrchr_i386 +# include <string.h> +extern void *__memrchr_i386 (const void *, int, size_t); +#endif + +#include "string/memrchr.c" diff --git a/sysdeps/i386/multiarch/memrchr-sse2-bsf.S b/sysdeps/i386/multiarch/memrchr-sse2-bsf.S new file mode 100644 index 0000000000..043e1bbd23 --- /dev/null +++ b/sysdeps/i386/multiarch/memrchr-sse2-bsf.S @@ -0,0 +1,417 @@ +/* Optimized memrchr with sse2 + Copyright (C) 2011-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +# define MEMCHR __memrchr_sse2_bsf + + .text +ENTRY (MEMCHR) + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + mov LEN(%esp), %edx + + sub $16, %edx + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + add %edx, %ecx + punpcklbw %xmm1, %xmm1 + + movdqu (%ecx), %xmm0 + pshufd $0, %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %ecx + mov %ecx, %eax + and $15, %eax + jz L(loop_prolog) + + add $16, %ecx + add $16, %edx + sub %eax, %ecx + sub %eax, %edx + + .p2align 4 +/* Loop start on aligned string. */ +L(loop_prolog): + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm4 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches0) + + mov %ecx, %eax + and $63, %eax + test 
%eax, %eax + jz L(align64_loop) + + add $64, %ecx + add $64, %edx + sub %eax, %ecx + sub %eax, %edx + + .p2align 4 +L(align64_loop): + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa (%ecx), %xmm0 + movdqa 16(%ecx), %xmm2 + movdqa 32(%ecx), %xmm3 + movdqa 48(%ecx), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm3, %xmm0 + pmaxub %xmm4, %xmm2 + pmaxub %xmm0, %xmm2 + pmovmskb %xmm2, %eax + + test %eax, %eax + jz L(align64_loop) + + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches48) + + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm2 + + pcmpeqb %xmm1, %xmm2 + pcmpeqb (%ecx), %xmm1 + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + pmovmskb %xmm1, %eax + bsr %eax, %eax + + add %ecx, %eax + ret + + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb (%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches0_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 32(%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches32_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsr %eax, %eax + add %ecx, %eax + ret + + .p2align 4 +L(matches16): + bsr %eax, %eax + lea 16(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches32): + bsr %eax, %eax + lea 32(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches48): + bsr %eax, %eax + lea 48(%eax, %ecx), %eax + 
ret + + .p2align 4 +L(matches0_1): + bsr %eax, %eax + sub $64, %edx + add %eax, %edx + jl L(return_null) + add %ecx, %eax + ret + + .p2align 4 +L(matches16_1): + bsr %eax, %eax + sub $48, %edx + add %eax, %edx + jl L(return_null) + lea 16(%ecx, %eax), %eax + ret + + .p2align 4 +L(matches32_1): + bsr %eax, %eax + sub $32, %edx + add %eax, %edx + jl L(return_null) + lea 32(%ecx, %eax), %eax + ret + + .p2align 4 +L(matches48_1): + bsr %eax, %eax + sub $16, %edx + add %eax, %edx + jl L(return_null) + lea 48(%ecx, %eax), %eax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + + .p2align 4 +L(length_less16_offset0): + mov %dl, %cl + pcmpeqb (%eax), %xmm1 + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + mov %edx, %ecx + + pmovmskb %xmm1, %edx + + and %ecx, %edx + test %edx, %edx + jz L(return_null) + + bsr %edx, %ecx + add %ecx, %eax + ret + + .p2align 4 +L(length_less16): + punpcklbw %xmm1, %xmm1 + mov %ecx, %eax + punpcklbw %xmm1, %xmm1 + add $16, %edx + jz L(return_null) + + pshufd $0, %xmm1, %xmm1 + and $15, %ecx + jz L(length_less16_offset0) + + PUSH (%edi) + mov %cl, %dh + add %dl, %dh + and $-16, %eax + + sub $16, %dh + ja L(length_less16_part2) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + sar %cl, %edi + add %ecx, %eax + mov %dl, %cl + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2): + movdqa 16(%eax), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %edi + + mov %cl, %ch + + mov %dh, %cl + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + + test %edi, %edi + jnz L(length_less16_part2_return) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + mov %ch, %cl + sar %cl, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + xor %ch, %ch + add %ecx, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2_return): + bsr 
%edi, %edi + lea 16(%eax, %edi), %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(ret_null): + xor %eax, %eax + POP (%edi) + ret + +END (MEMCHR) +#endif diff --git a/sysdeps/i386/multiarch/memrchr-sse2.S b/sysdeps/i386/multiarch/memrchr-sse2.S new file mode 100644 index 0000000000..65e3c8bc73 --- /dev/null +++ b/sysdeps/i386/multiarch/memrchr-sse2.S @@ -0,0 +1,724 @@ +/* Optimized memrchr with sse2 without bsf + Copyright (C) 2011-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4	/* Offset from %esp of the first argument at entry.  */
+# define STR1	PARMS	/* Argument 1: start of the buffer to search.  */
+# define STR2	STR1+4	/* Argument 2: byte value to look for (low 8 bits used).  */
+# define LEN	STR2+4	/* Argument 3: number of bytes in the buffer.  */
+
+	atom_text_section
+ENTRY (__memrchr_sse2)	/* %eax = address of the last byte equal to STR2 in the LEN bytes at STR1, or 0.  */
+	mov	STR1(%esp), %ecx	/* %ecx = buffer start.  */
+	movd	STR2(%esp), %xmm1	/* Search byte in the low byte of %xmm1.  */
+	mov	LEN(%esp), %edx	/* %edx = length.  */
+
+	sub	$16, %edx
+	jbe	L(length_less16)	/* Length <= 16 is handled separately.  */
+
+	punpcklbw %xmm1, %xmm1
+	add	%edx, %ecx	/* %ecx = address of the final 16 bytes.  */
+	punpcklbw %xmm1, %xmm1
+
+	movdqu	(%ecx), %xmm0	/* Unaligned load of the final 16 bytes.  */
+	pshufd	$0, %xmm1, %xmm1	/* Search byte now replicated in all 16 lanes.  */
+	pcmpeqb	%xmm1, %xmm0
+
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)	/* Match inside the final 16 bytes.  */
+
+	sub	$64, %ecx	/* From here %edx == (%ecx + 64) - buffer start.  */
+	mov	%ecx, %eax
+	and	$15, %eax
+	jz	L(loop_prolog)
+
+	lea	16(%ecx), %ecx
+	lea	16(%edx), %edx
+	sub	%eax, %edx
+	and	$-16, %ecx	/* %ecx rounded up to a 16-byte boundary; %edx adjusted by the same amount.  */
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop_prolog):
+	sub	$64, %edx	/* %edx = %ecx - buffer start.  */
+	jbe	L(exit_loop)	/* The 64 bytes at %ecx reach the buffer start.  */
+
+	movdqa	48(%ecx), %xmm0	/* Check the four 16-byte pieces from highest to lowest address.  */
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm4
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb %xmm4, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	sub	$64, %ecx	/* Second unrolled 64-byte step.  */
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	movdqa	(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	mov	%ecx, %eax
+	and	$63, %eax
+	test	%eax, %eax
+	jz	L(align64_loop)
+
+	lea	64(%ecx), %ecx
+	lea	64(%edx), %edx
+	and	$-64, %ecx	/* %ecx rounded up to a 64-byte boundary.  */
+	sub	%eax, %edx	/* Keep %edx consistent with the moved %ecx.  */
+
+	.p2align 4
+L(align64_loop):	/* Main loop: one 64-byte-aligned block per iteration.  */
+	sub	$64, %ecx
+	sub	$64, %edx
+	jbe	L(exit_loop)
+
+	movdqa	(%ecx), %xmm0
+	movdqa	16(%ecx), %xmm2
+	movdqa	32(%ecx), %xmm3
+	movdqa	48(%ecx), %xmm4
+
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm1, %xmm4
+
+	pmaxub	%xmm3, %xmm0	/* Lanes are 0x00 or 0xff, so the byte maximum merges the four results.  */
+	pmaxub	%xmm4, %xmm2
+	pmaxub	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+
+	test	%eax, %eax
+	jz	L(align64_loop)	/* No match anywhere in these 64 bytes.  */
+
+	pmovmskb %xmm4, %eax	/* %xmm4 and %xmm3 still hold their own compare results.  */
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm2	/* %xmm2 and %xmm0 were overwritten by pmaxub: compare again.  */
+
+	pcmpeqb	%xmm1, %xmm2
+	pcmpeqb	(%ecx), %xmm1	/* Overwrites the pattern register; every path from here returns.  */
+
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches16)
+
+	pmovmskb %xmm1, %eax	/* Match is in the lowest 16 bytes; same bit tests as L(exit_dispatch).  */
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_loop):	/* Last block: it contains the buffer start.  */
+	add	$64, %edx	/* %edx = number of buffer bytes within the 64 bytes at %ecx.  */
+	cmp	$32, %edx
+	jbe	L(exit_loop_32)
+
+	movdqa	48(%ecx), %xmm0	/* More than 32 bytes left: pieces at +48 and +32 lie wholly in the buffer.  */
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48)
+
+	movdqa	32(%ecx), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %eax
+	test	%eax, %eax
+	jnz	L(matches32)
+
+	movdqa	16(%ecx), %xmm3
+	pcmpeqb	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	test	%eax, %eax
+	jnz	L(matches16_1)	/* The _1 targets also reject matches below the buffer start.  */
+	cmp	$48, %edx
+	jbe	L(return_null)	/* Nothing of the buffer lies below %ecx + 16.  */
+
+	pcmpeqb	(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches0_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(exit_loop_32):	/* At most 32 buffer bytes remain, all at %ecx + 32 or above.  */
+	movdqa	48(%ecx), %xmm0
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(matches48_1)
+	cmp	$16, %edx
+	jbe	L(return_null)
+
+	pcmpeqb	32(%ecx), %xmm1
+	pmovmskb %xmm1, %eax
+	test	%eax, %eax
+	jnz	L(matches32_1)
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(matches16):	/* Mask in %eax describes the 16 bytes at %ecx + 16.  */
+	lea	16(%ecx), %ecx
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32):	/* Mask in %eax describes the 16 bytes at %ecx + 32.  */
+	lea	32(%ecx), %ecx
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48):	/* Mask in %eax describes the 16 bytes at %ecx + 48; falls into L(exit_dispatch).  */
+	lea	48(%ecx), %ecx
+
+	.p2align 4
+L(exit_dispatch):	/* Return %ecx + index of the highest set bit of %ax, located with bit tests.  */
+	test	%ah, %ah
+	jnz	L(exit_dispatch_high)	/* Some bit in 8..15 is set.  */
+	mov	%al, %dl
+	and	$15 << 4, %dl
+	jnz	L(exit_dispatch_8)	/* Some bit in 4..7 is set.  */
+	test	$0x08, %al
+	jnz	L(exit_4)
+	test	$0x04, %al
+	jnz	L(exit_3)
+	test	$0x02, %al
+	jnz	L(exit_2)
+	mov	%ecx, %eax	/* Only bit 0 is left: match at offset 0.  */
+	ret
+
+	.p2align 4
+L(exit_dispatch_8):	/* Highest set bit is one of bits 4..7 of %al.  */
+	test	$0x80, %al
+	jnz	L(exit_8)
+	test	$0x40, %al
+	jnz	L(exit_7)
+	test	$0x20, %al
+	jnz	L(exit_6)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_high):	/* Highest set bit is in %ah (byte offsets 8..15).  */
+	mov	%ah, %dh
+	and	$15 << 4, %dh
+	jnz	L(exit_dispatch_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_12)
+	test	$0x04, %ah
+	jnz	L(exit_11)
+	test	$0x02, %ah
+	jnz	L(exit_10)
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_high_8):	/* Highest set bit is one of bits 4..7 of %ah (offsets 12..15).  */
+	test	$0x80, %ah
+	jnz	L(exit_16)
+	test	$0x40, %ah
+	jnz	L(exit_15)
+	test	$0x20, %ah
+	jnz	L(exit_14)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_2):	/* L(exit_N) returns the address at byte offset N - 1 from %ecx.  */
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_3):
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_4):
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_6):
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_7):
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_8):
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_10):
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_11):
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_12):
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_14):
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_15):
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_16):
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(matches0_1):	/* Bounded dispatch for the piece at %ecx + 0 of the last block.  */
+	lea	-64(%edx), %edx	/* %edx + match offset < 0 means the match lies below the buffer start.  */
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches16_1):	/* Bounded dispatch for the piece at %ecx + 16.  */
+	lea	-48(%edx), %edx
+	lea	16(%ecx), %ecx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches32_1):	/* Bounded dispatch for the piece at %ecx + 32.  */
+	lea	-32(%edx), %edx
+	lea	32(%ecx), %ecx
+
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(matches48_1):	/* Bounded dispatch for the piece at %ecx + 48; falls into L(exit_dispatch_1).  */
+	lea	-16(%edx), %edx
+	lea	48(%ecx), %ecx
+
+	.p2align 4
+L(exit_dispatch_1):	/* As L(exit_dispatch), but returns 0 when %edx + offset is negative.  */
+	test	%ah, %ah
+	jnz	L(exit_dispatch_1_high)
+	mov	%al, %ah	/* %ah is zero here, so it is free as scratch.  */
+	and	$15 << 4, %ah
+	jnz	L(exit_dispatch_1_8)
+	test	$0x08, %al
+	jnz	L(exit_1_4)
+	test	$0x04, %al
+	jnz	L(exit_1_3)
+	test	$0x02, %al
+	jnz	L(exit_1_2)
+	add	$0, %edx	/* Sets the flags from %edx for the offset-0 bound check.  */
+	jl	L(return_null)
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_8):	/* Highest set bit is one of bits 4..7 of %al.  */
+	test	$0x80, %al
+	jnz	L(exit_1_8)
+	test	$0x40, %al
+	jnz	L(exit_1_7)
+	test	$0x20, %al
+	jnz	L(exit_1_6)
+	add	$4, %edx
+	jl	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high):	/* Highest set bit is in %ah; %al is reused as scratch.  */
+	mov	%ah, %al
+	and	$15 << 4, %al
+	jnz	L(exit_dispatch_1_high_8)
+	test	$0x08, %ah
+	jnz	L(exit_1_12)
+	test	$0x04, %ah
+	jnz	L(exit_1_11)
+	test	$0x02, %ah
+	jnz	L(exit_1_10)
+	add	$8, %edx
+	jl	L(return_null)
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_dispatch_1_high_8):	/* Highest set bit is one of bits 4..7 of %ah.  */
+	test	$0x80, %ah
+	jnz	L(exit_1_16)
+	test	$0x40, %ah
+	jnz	L(exit_1_15)
+	test	$0x20, %ah
+	jnz	L(exit_1_14)
+	add	$12, %edx
+	jl	L(return_null)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_2):	/* L(exit_1_N): offset N - 1 from %ecx, or 0 if that is below the buffer start.  */
+	add	$1, %edx
+	jl	L(return_null)
+	lea	1(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_3):
+	add	$2, %edx
+	jl	L(return_null)
+	lea	2(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_4):
+	add	$3, %edx
+	jl	L(return_null)
+	lea	3(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_6):
+	add	$5, %edx
+	jl	L(return_null)
+	lea	5(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_7):
+	add	$6, %edx
+	jl	L(return_null)
+	lea	6(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_8):
+	add	$7, %edx
+	jl	L(return_null)
+	lea	7(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_10):
+	add	$9, %edx
+	jl	L(return_null)
+	lea	9(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_11):
+	add	$10, %edx
+	jl	L(return_null)
+	lea	10(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_12):
+	add	$11, %edx
+	jl	L(return_null)
+	lea	11(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_14):
+	add	$13, %edx
+	jl	L(return_null)
+	lea	13(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_15):
+	add	$14, %edx
+	jl	L(return_null)
+	lea	14(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit_1_16):
+	add	$15, %edx
+	jl	L(return_null)
+	lea	15(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):	/* No match: return 0 (nothing to pop on this path).  */
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16_offset0):	/* Length 1..16 and buffer 16-byte aligned; %eax = buffer, %dl = length.  */
+	mov	%dl, %cl
+	pcmpeqb	(%eax), %xmm1
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* %edx = (1 << length) - 1: mask of the bytes inside the buffer.  */
+
+	mov	%eax, %ecx	/* L(exit_dispatch) expects the base address in %ecx.  */
+	pmovmskb %xmm1, %eax
+
+	and	%edx, %eax
+	test	%eax, %eax
+	jnz	L(exit_dispatch)
+
+	xor	%eax, %eax
+	ret
+
+	.p2align 4
+L(length_less16):	/* Entered with %edx = length - 16 <= 0 and %ecx = buffer start.  */
+	punpcklbw %xmm1, %xmm1
+	add	$16, %edx	/* %edx = length again.  */
+	je	L(return_null)	/* Zero length: nothing to search.  */
+	punpcklbw %xmm1, %xmm1
+
+	mov	%ecx, %eax
+	pshufd	$0, %xmm1, %xmm1	/* Search byte replicated in all 16 lanes.  */
+
+	and	$15, %ecx	/* %ecx = misalignment of the buffer start.  */
+	jz	L(length_less16_offset0)
+
+	PUSH	(%edi)	/* %edi is used as scratch below and restored before each return.  */
+
+	mov	%cl, %dh
+	add	%dl, %dh	/* %dh = misalignment + length.  */
+	and	$-16, %eax	/* %eax = aligned 16-byte block containing the buffer start.  */
+
+	sub	$16, %dh
+	ja	L(length_less16_part2)	/* Buffer extends into the next aligned block.  */
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	sar	%cl, %edi	/* Drop mask bits for bytes before the buffer start.  */
+	add	%ecx, %eax	/* %eax = buffer start.  */
+	mov	%dl, %cl
+
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx	/* %edx = (1 << length) - 1.  */
+
+	and	%edx, %edi
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi	/* Index of the last matching byte.  */
+	add	%edi, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH (%edi)	/* The code below is entered with %edi still on the stack.  */
+
+	.p2align 4
+L(length_less16_part2):	/* Buffer straddles two aligned 16-byte blocks; check the upper one first.  */
+	movdqa	16(%eax), %xmm2
+	pcmpeqb	%xmm1, %xmm2
+	pmovmskb %xmm2, %edi
+
+	mov	%cl, %ch	/* Keep the misalignment in %ch while %cl is the shift count.  */
+
+	mov	%dh, %cl	/* %cl = number of buffer bytes in the upper block.  */
+	mov	$1, %edx
+	sal	%cl, %edx
+	sub	$1, %edx
+
+	and	%edx, %edi
+
+	test	%edi, %edi
+	jnz	L(length_less16_part2_return)
+
+	pcmpeqb	(%eax), %xmm1
+	pmovmskb %xmm1, %edi
+
+	mov	%ch, %cl
+	sar	%cl, %edi	/* Drop mask bits for bytes before the buffer start.  */
+	test	%edi, %edi
+	jz	L(ret_null)
+
+	bsr	%edi, %edi
+	add	%edi, %eax
+	xor	%ch, %ch	/* %ecx = misalignment only.  */
+	add	%ecx, %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(length_less16_part2_return):	/* Match found in the upper aligned block.  */
+	bsr	%edi, %edi
+	lea	16(%eax, %edi), %eax
+	POP	(%edi)
+	ret
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(ret_null):	/* No match on a path that pushed %edi.  */
+	xor	%eax, %eax
+	POP	(%edi)
+	ret
+
+END (__memrchr_sse2)
+#endif
diff --git a/sysdeps/i386/multiarch/memrchr.S b/sysdeps/i386/multiarch/memrchr.S
new file mode 100644
index 0000000000..be8deae4e6
--- /dev/null
+++ b/sysdeps/i386/multiarch/memrchr.S
@@ -0,0 +1,45 @@
+/* Multiple versions of memrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2015 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.
*/
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(__memrchr)	/* Selector: picks one of the three memrchr implementations named below.  */
+	.type	__memrchr, @gnu_indirect_function	/* Symbol is resolved through this routine (IFUNC).  */
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_CPU_FEATURE (SSE2)
+	jz	2f	/* SSE2 test left ZF set: use the plain i386 version.  */
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz	3f	/* Slow_BSF test left ZF set: use the SSE2 version built on bsf/bsr.  */
+
+	LOAD_FUNC_GOT_EAX (__memrchr_sse2)	/* Otherwise: the SSE2 version written without bsf.  */
+	ret
+
+2:	LOAD_FUNC_GOT_EAX (__memrchr_i386)
+	ret
+
+3:	LOAD_FUNC_GOT_EAX (__memrchr_sse2_bsf)
+	ret
+END(__memrchr)
+
+weak_alias(__memrchr, memrchr)	/* Export the public name memrchr as a weak alias.  */
+#endif
-- cgit 1.4.1