diff options
author | Liubov Dmitrieva <liubov.dmitrieva@intel.com> | 2011-09-05 17:11:11 -0400 |
---|---|---|
committer | Ulrich Drepper <drepper@gmail.com> | 2011-09-05 17:11:11 -0400 |
commit | 693fb948841c7564ce3cd1ec4b31c0130abc8c42 (patch) | |
tree | e67f897cbe65d0f9829d85d70004ffad96a9dd70 /sysdeps/i386/i686/multiarch/strchr-sse2.S | |
parent | 49d42c37ba4f688ed442bfa0ff54e851b58e607b (diff) | |
download | glibc-693fb948841c7564ce3cd1ec4b31c0130abc8c42.tar.gz glibc-693fb948841c7564ce3cd1ec4b31c0130abc8c42.tar.xz glibc-693fb948841c7564ce3cd1ec4b31c0130abc8c42.zip |
Optimized strchr and strrchr with SSE2 on x86-32
Diffstat (limited to 'sysdeps/i386/i686/multiarch/strchr-sse2.S')
-rw-r--r-- | sysdeps/i386/i686/multiarch/strchr-sse2.S | 350 |
1 files changed, 350 insertions, 0 deletions
diff --git a/sysdeps/i386/i686/multiarch/strchr-sse2.S b/sysdeps/i386/i686/multiarch/strchr-sse2.S new file mode 100644 index 0000000000..7702210b1c --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strchr-sse2.S @@ -0,0 +1,350 @@ +/* strchr SSE2 without bsf + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + /* Everything below is assembled only when NOT_IN_libc is not defined. */ +#ifndef NOT_IN_libc + +# include <sysdep.h> + /* CFI_PUSH and CFI_POP record a 4-byte change of the CFA offset together with the save or restore of REG. The cfi_* helpers, ENTRY, END and L() are defined outside this file (presumably by sysdep.h). */ +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + /* PUSH and POP: push or pop REG and emit the matching CFI annotation. */ +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + /* ENTRANCE saves EDI, so the first argument sits PARMS = 8 bytes above ESP (saved EDI plus return address). RETURN restores EDI and returns. The CFI_PUSH after the ret re-establishes the EDI-is-pushed unwind state for the code that follows the ret in the file. */ +# define PARMS 8 +# define ENTRANCE PUSH(%edi) +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); + /* STR1 and STR2: ESP-relative offsets of the two stack arguments, valid after ENTRANCE. */ +# define STR1 PARMS +# define STR2 STR1+4 + + .text +ENTRY (__strchr_sse2) + /* In: STR1(%esp) = string pointer, STR2(%esp) = character to find (only its low byte is broadcast and compared). Out: EAX = address of the first byte equal to the character if it occurs no later than the first zero byte, otherwise 0. Uses ECX, EDX, XMM0-XMM2. EDI is saved and restored. Presumably called with the C strchr signature - confirm against the multiarch dispatcher. */ + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + /* XMM2 = 0, the pattern used to detect the terminating zero byte. */ + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 /* Byte 0 now fills bytes 0-1. */ + punpcklbw %xmm1, %xmm1 /* Byte 0 now fills bytes 0-3. */ + /* ECX has OFFSET: the string address modulo 16. */ + and $15, %ecx + pshufd $0, %xmm1, %xmm1 /* Replicate dword 0: all 16 bytes of XMM1 hold the character. */ + je L(loop) /* Tests the flags of the AND above (pshufd leaves EFLAGS alone): OFFSET is 0, EDI is already aligned. */ + +/* Handle unaligned string. */ + and $-16, %edi /* Round EDI down to the enclosing 16-byte block so that movdqa can be used. */ + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 /* XMM2 = 0xff in every byte position holding zero. */ + pcmpeqb %xmm1, %xmm0 /* XMM0 = 0xff in every byte position holding the character. */ + /* Find where NULL is. 
*/ + pmovmskb %xmm2, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes: shift out the mask bits of the OFFSET bytes that precede the string. pmovmskb leaves bits 16-31 clear, so the arithmetic shift brings in zeros. */ + sarl %cl, %edx + sarl %cl, %eax + test %eax, %eax + jz L(unaligned_no_match) + /* Check which byte is a match. */ + /* Is there a NULL? */ + add %ecx, %edi /* EDI is the original string pointer again, so mask bit i corresponds to byte i(%edi). */ + test %edx, %edx + jz L(match_case1) + jmp L(match_case2) + /* No match in the first partial block: fail if it holds the terminator, otherwise continue with the aligned loop. */ + .p2align 4 +L(unaligned_no_match): + test %edx, %edx + jne L(return_null) + /* The zero compare above may have flagged bytes that lie before the string. Those bits were shifted out of EDX but are still set in XMM2, so clear it before entering the loop. */ + pxor %xmm2, %xmm2 + add $16, %edi + + .p2align 4 +/* Loop start on aligned string. Each step loads 16 aligned bytes and computes EDX = zero-byte mask and EAX = match mask. XMM2 is not cleared between steps: reaching the next step means EDX was 0, so the compare left XMM2 all zero. The step is unrolled four times. */ +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + /* Unrolled step 2 of 4. */ + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + /* Unrolled step 3 of 4. */ + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + /* Unrolled step 4 of 4. */ + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + jmp L(loop) + +L(matches): + /* There is a match. First find where NULL is. 
*/ + test %edx, %edx + jz L(match_case1) + /* Case 2: EAX (match mask) and EDX (zero-byte mask) are both nonzero. Positions are examined from low to high, match bit before zero bit, so NULL is returned as soon as a zero byte comes first. */ + .p2align 4 +L(match_case2): + test %al, %al + jz L(match_higth_case2) /* No match in bytes 0-7. */ + + mov %al, %cl + and $15, %cl + jnz L(match_case2_4) /* A match lies in bytes 0-3. */ + + mov %dl, %ch + and $15, %ch + jnz L(return_null) /* Zero byte in bytes 0-3, where there is no match. */ + + test $0x10, %al + jnz L(Exit5) + test $0x10, %dl + jnz L(return_null) + test $0x20, %al + jnz L(Exit6) + test $0x20, %dl + jnz L(return_null) + test $0x40, %al + jnz L(Exit7) + test $0x40, %dl + jnz L(return_null) + lea 7(%edi), %eax /* AL is nonzero and bits 0-6 are clear, so the match is byte 7. */ + RETURN + /* Match within bytes 0-3, zero byte somewhere in the block. */ + .p2align 4 +L(match_case2_4): + test $0x01, %al + jnz L(Exit1) + test $0x01, %dl + jnz L(return_null) + test $0x02, %al + jnz L(Exit2) + test $0x02, %dl + jnz L(return_null) + test $0x04, %al + jnz L(Exit3) + test $0x04, %dl + jnz L(return_null) + lea 3(%edi), %eax /* Bits 0-2 hold neither match nor zero byte, so the match is byte 3. */ + RETURN + /* Match only within bytes 8-15 (label spelling as in the source). Same scheme as above using the high mask bytes AH and DH. */ + .p2align 4 +L(match_higth_case2): + test %dl, %dl + jnz L(return_null) /* A zero byte in bytes 0-7 precedes the match. */ + + mov %ah, %cl + and $15, %cl + jnz L(match_case2_12) /* A match lies in bytes 8-11. */ + + mov %dh, %ch + and $15, %ch + jnz L(return_null) /* Zero byte in bytes 8-11, where there is no match. */ + + test $0x10, %ah + jnz L(Exit13) + test $0x10, %dh + jnz L(return_null) + test $0x20, %ah + jnz L(Exit14) + test $0x20, %dh + jnz L(return_null) + test $0x40, %ah + jnz L(Exit15) + test $0x40, %dh + jnz L(return_null) + lea 15(%edi), %eax /* Only bit 7 of AH is left, so the match is byte 15. */ + RETURN + /* Match within bytes 8-11, zero byte somewhere in bytes 8-15. */ + .p2align 4 +L(match_case2_12): + test $0x01, %ah + jnz L(Exit9) + test $0x01, %dh + jnz L(return_null) + test $0x02, %ah + jnz L(Exit10) + test $0x02, %dh + jnz L(return_null) + test $0x04, %ah + jnz L(Exit11) + test $0x04, %dh + jnz L(return_null) + lea 11(%edi), %eax /* Bits 0-2 of AH and DH are clear, so the match is byte 11. */ + RETURN + /* Case 1: a match and no zero byte in the block. Return the position of the lowest set bit of EAX. */ + .p2align 4 +L(match_case1): + test %al, %al + jz L(match_higth_case1) + + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + lea 7(%edi), %eax /* AL is nonzero with bits 0-6 clear: byte 7. */ + RETURN + /* Case 1 with AL equal to 0: the lowest match bit is in AH (bytes 8-15). */ + .p2align 4 +L(match_higth_case1): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz 
L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + lea 15(%edi), %eax /* Bits 0-6 of AH are clear: byte 15. */ + RETURN + /* L(ExitN) returns EDI + (N - 1), the address of the matching byte. Bytes 7 and 15 are returned inline by the code above, so there is no Exit8 or Exit16. */ + .p2align 4 +L(Exit1): + lea (%edi), %eax + RETURN + + .p2align 4 +L(Exit2): + lea 1(%edi), %eax + RETURN + + .p2align 4 +L(Exit3): + lea 2(%edi), %eax + RETURN + + .p2align 4 +L(Exit4): + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(Exit5): + lea 4(%edi), %eax + RETURN + + .p2align 4 +L(Exit6): + lea 5(%edi), %eax + RETURN + + .p2align 4 +L(Exit7): + lea 6(%edi), %eax + RETURN + + .p2align 4 +L(Exit9): + lea 8(%edi), %eax + RETURN + + .p2align 4 +L(Exit10): + lea 9(%edi), %eax + RETURN + + .p2align 4 +L(Exit11): + lea 10(%edi), %eax + RETURN + + .p2align 4 +L(Exit12): + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(Exit13): + lea 12(%edi), %eax + RETURN + + .p2align 4 +L(Exit14): + lea 13(%edi), %eax + RETURN + + .p2align 4 +L(Exit15): + lea 14(%edi), %eax + RETURN + +/* Return NULL. */ + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + +END (__strchr_sse2) +#endif + |