From 693fb948841c7564ce3cd1ec4b31c0130abc8c42 Mon Sep 17 00:00:00 2001
From: Liubov Dmitrieva
Date: Mon, 5 Sep 2011 17:11:11 -0400
Subject: Optimized strchr and strrchr with SSE2 on x86-32

---
 sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S | 159 ++++++++++++++++++++++++++
 1 file changed, 159 insertions(+)
 create mode 100644 sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S

(limited to 'sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S')

diff --git a/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S b/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
new file mode 100644
index 0000000000..5a19ba26bc
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
@@ -0,0 +1,159 @@
+/* strchr with SSE2 with bsf
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.
*/
+
+#ifndef NOT_IN_libc
+
+/* ENTRY, END, L and the cfi_* macros used below are not defined in
+   this file; they come from this header.  */
+# include <sysdep.h>
+
+/* Unwind annotation for a 4-byte push of REG: the CFA moves 4 bytes
+   further from %esp and REG is recorded as saved at the new stack top.  */
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+/* Unwind annotation for the matching 4-byte pop of REG.  */
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+/* Offset of the first argument from %esp once ENTRANCE has run:
+   4 bytes of return address plus 4 bytes of saved %edi.  */
+# define PARMS	8
+# define ENTRANCE	PUSH(%edi)
+/* Restore %edi and return.  The trailing CFI_PUSH makes the unwind
+   information for the code that follows the RET describe %edi as still
+   saved on the stack, which is the state on every other path.  */
+# define RETURN	POP(%edi); ret; CFI_PUSH(%edi);
+
+/* First and second stack arguments: the string and the character.  */
+# define STR1	PARMS
+# define STR2	STR1+4
+
+/* __strchr_sse2_bsf -- SSE2 strchr for x86-32, called as strchr (s, c)
+   with both arguments on the stack.
+
+   Returns in %eax the address of the first byte of S that equals the
+   low byte of C, or 0 if the terminating NUL is reached first.  When
+   the low byte of C is 0 the match and the NUL coincide and the address
+   of the terminator is returned.
+
+   The string is examined in 16-byte aligned blocks.  For each block two
+   16-bit masks are built with PMOVMSKB, one for bytes equal to C and one
+   for NUL bytes, and BSF picks the lowest set bit of each.
+
+   Register use:
+     %edi   address of the aligned block being examined (saved by
+	    ENTRANCE, restored by RETURN)
+     %ecx   offset of S inside its first aligned block; in L(matches)
+	    the index of the first NUL
+     %eax   match mask, then index of the first match, then the result
+     %edx   NUL mask (inside the loop temporarily NUL mask | match mask)
+     %xmm0  block loaded from memory, then the match byte mask
+     %xmm1  low byte of C replicated into all 16 byte lanes
+     %xmm2  all zero before every compare, NUL byte mask after it  */
+	.text
+ENTRY (__strchr_sse2_bsf)
+
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	pxor	%xmm2, %xmm2
+	mov	%ecx, %edi
+	/* Two byte-unpacks copy the low byte of C into the whole low
+	   dword of XMM1.  */
+	punpcklbw %xmm1, %xmm1
+	punpcklbw %xmm1, %xmm1
+	/* ECX has OFFSET.  */
+	and	$15, %ecx
+	/* Broadcast that dword to all four lanes.  PSHUFD does not write
+	   EFLAGS, so the JE below still tests the result of the AND.  */
+	pshufd	$0, %xmm1, %xmm1
+	je	L(loop)
+
+/* Handle unaligned string.  Load the aligned block that contains the
+   start of the string; its first ECX bytes precede the string.  */
+	and	$-16, %edi
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %eax
+	/* Remove the leading bytes.  After the shift, bit 0 of each mask
+	   corresponds to the first byte of the string.  */
+	sarl	%cl, %edx
+	sarl	%cl, %eax
+	test	%eax, %eax
+	je	L(unaligned_no_match)
+	/* Check which byte is a match.  */
+	bsf	%eax, %eax
+	/* Is there a NULL?  */
+	test	%edx, %edx
+	je	L(unaligned_match)
+	bsf	%edx, %edx
+	cmpl	%edx, %eax
+	/* Return NULL if NULL comes first.  */
+	ja	L(return_null)
+L(unaligned_match):
+	/* Result = aligned block address + OFFSET + index of the match.  */
+	add	%edi, %eax
+	add	%ecx, %eax
+	RETURN
+
+	.p2align 4
+L(unaligned_no_match):
+	/* No match in the first block: a NUL there ends the search.  */
+	test	%edx, %edx
+	jne	L(return_null)
+	/* XMM2 must be zero again before the loop's first compare.  */
+	pxor	%xmm2, %xmm2
+
+	add	$16, %edi
+
+	.p2align 4
+/* Loop start on aligned string.  The body is unrolled four times.
+   Every step loads one block, advances EDI past it, and leaves the
+   match mask in EAX and NUL mask | match mask in EDX.  When a step
+   falls through, its NUL mask was empty, so XMM2 is all zero again and
+   serves as the zero operand of the next compare.  */
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm0
+	pcmpeqb	%xmm0, %xmm2
+	add	$16, %edi
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	jmp	L(loop)
+
+/* The block just examined holds a match, a NUL, or both.  EAX is the
+   match mask; EDX was overwritten by the OR, so the NUL mask is read
+   from XMM2 again.  EDI already points past the block.  */
+L(matches):
+	pmovmskb %xmm2, %edx
+	test	%eax, %eax
+	jz	L(return_null)
+	bsf	%eax, %eax
+	/* There is a match.  First find where NULL is.  */
+	test	%edx, %edx
+	je	L(match)
+	bsf	%edx, %ecx
+	/* Check if NULL comes first.  */
+	cmpl	%ecx, %eax
+	ja	L(return_null)
+L(match):
+	/* Step EDI back to the start of the block and add the index.  */
+	sub	$16, %edi
+	add	%edi, %eax
+	RETURN
+
+/* Return NULL.  */
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+END (__strchr_sse2_bsf)
+#endif
-- 
cgit 1.4.1