diff options
author | Ulrich Drepper <drepper@gmail.com> | 2011-12-17 14:39:23 -0500 |
---|---|---|
committer | Ulrich Drepper <drepper@gmail.com> | 2011-12-17 14:39:23 -0500 |
commit | 1d3e4b618ae0217f1736753f3085f9c4fcc827bf (patch) | |
tree | 90a3f8d19f941a684e1482b8813c534d82cfb19e /sysdeps/i386/i686/multiarch/wcschr-sse2.S | |
parent | a2d18b64edb486825fb5946eefc2131426ccfec9 (diff) | |
download | glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.gz glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.xz glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.zip |
Optimized wcschr and wcscpy for x86-64 and x86-32
Diffstat (limited to 'sysdeps/i386/i686/multiarch/wcschr-sse2.S')
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcschr-sse2.S | 220 |
1 files changed, 220 insertions, 0 deletions
diff --git a/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/sysdeps/i386/i686/multiarch/wcschr-sse2.S new file mode 100644 index 0000000000..cc8204cfe3 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcschr-sse2.S @@ -0,0 +1,220 @@ +/* wcschr with SSE2, without using bsf instructions + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#ifndef NOT_IN_libc +# include <sysdep.h> + +/* Unwind annotations that accompany a 4-byte push or pop of REG: move the CFA offset by 4 and record (or restore) where REG is saved. */ +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +/* Stack offsets of the two arguments at entry, before anything is pushed. STR1 is loaded as the pointer to scan and STR2 as the 32-bit value to look for; PARMS skips the first 4 bytes at %esp (presumably the return address). */ +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + +/* __wcschr_sse2: scan the 32-bit elements starting at the pointer in STR1 for the first one equal to the value in STR2, stopping at the first zero element. Result in %eax: the address of the matching element, or 0 when a zero element comes first. When the value searched for is itself 0, the address of the zero element is returned. Uses %eax, %ecx, %edx, %xmm0-%xmm2 and the flags; %edi is pushed and popped on the L(cross_cache) path only. The lane offsets used below (0/4/8/12) assume the pointer is 4-byte aligned. */ + atom_text_section +ENTRY (__wcschr_sse2) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + +/* Copy the search value into all four dword lanes of %xmm1 (two punpckldq steps) and zero %xmm2 for the terminator compare. */ + mov %ecx, %eax + punpckldq %xmm1, %xmm1 + pxor %xmm2, %xmm2 + punpckldq %xmm1, %xmm1 + +/* %eax = pointer modulo 64. Above 48, an unaligned 16-byte load would extend past the current 64-byte-aligned block, so that case is handled with an aligned load at L(cross_cache). */ + and $63, %eax + cmp $48, %eax + ja L(cross_cache) + +/* First 16 bytes, unaligned. After the compares %edx is the byte mask of zero lanes and %eax the byte mask of lanes equal to the search value. */ + movdqu (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) +/* Neither found: round %ecx down to 16 bytes; L(loop) adds 16 before its first load. %xmm2 is all zero again because no lane compared equal to zero. */ + and $-16, %ecx + jmp L(loop) + + .p2align 4 +L(cross_cache): +/* Load the aligned 16 bytes containing the start (%edi = aligned base, %ecx = pointer & 15) and shift both masks right by that offset to discard the bytes before the start. */ + PUSH (%edi) + mov %ecx, %edi + mov %eax, %ecx + and $-16, %edi + and $15, %ecx + movdqa (%edi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + + sarl %cl, %edx + sarl %cl, %eax + test %eax, %eax + jz L(unaligned_no_match) + +/* A match exists: base + offset puts the original pointer back in %ecx, which is what the shifted mask bits are relative to. The tests below repeat the sequence at L(matches)/L(match_case2). */ + add %edi, %ecx + POP (%edi) + + test %edx, %edx + jz L(match_case1) + test %al, %al + jz L(match_higth_case2) + test $15, %al + jnz L(match_case2_4) + test $15, %dl + jnz L(return_null) + lea 4(%ecx), %eax + ret + +/* CFI_PUSH carries only the unwind annotations (no pushl): L(unaligned_no_match) is entered with %edi still pushed, so the unwind state is set back to describe that. */ + CFI_PUSH (%edi) + + .p2align 4 +L(unaligned_no_match): +/* No match in the first block. Continue from the aligned base; a zero lane at or after the start ends the search with a null result. */ + mov %edi, %ecx + POP (%edi) + + test %edx, %edx + jnz L(return_null) + +/* Clear %xmm2 again: it can still flag zero lanes located before the start, which were shifted out of %edx. */ + pxor %xmm2, %xmm2 + +/* Loop start on aligned string. 
*/ + .p2align 4 +L(loop): +/* Each iteration checks four consecutive aligned 16-byte blocks. %xmm2 is not cleared between blocks: when a block has no zero lane, the compare leaves it all zero. */ + add $16, %ecx + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jz L(loop) + + .p2align 4 +L(matches): +/* Here %ecx is the address the masks are relative to and %eax is the match mask. %edx was overwritten by the OR, so the zero-lane mask is read from %xmm2 again. An empty match mask means only a terminator was found. */ + pmovmskb %xmm2, %edx + test %eax, %eax + jz L(return_null) + test %edx, %edx + jz L(match_case1) + + .p2align 4 +L(match_case2): +/* Both a match and a zero lane are present. Each dword lane owns one nibble of the masks (lanes 0-1 in the low byte, lanes 2-3 in the high byte). The match is returned only if no zero lane precedes it. */ + test %al, %al + jz L(match_higth_case2) + test $15, %al + jnz L(match_case2_4) + test $15, %dl + jnz L(return_null) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(match_case2_4): +/* Match in lane 0. */ + mov %ecx, %eax + ret + + .p2align 4 +L(match_higth_case2): +/* Match is in lane 2 or 3; any zero lane in lanes 0-1 comes first. ("higth" in the label names is presumably a misspelling of "high".) */ + test %dl, %dl + jnz L(return_null) + test $15, %ah + jnz L(match_case2_12) + test $15, %dh + jnz L(return_null) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(match_case2_12): +/* Despite the label name, this returns lane 2 (offset 8). */ + lea 8(%ecx), %eax + ret + + .p2align 4 +L(match_case1): +/* A match and no zero lane in this block: return the first matching lane. Bit 0 of %al set means lane 0; otherwise, with %al nonzero, it is lane 1. */ + test %al, %al + jz L(match_higth_case1) + + test $0x01, %al + jnz L(exit0) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(match_higth_case1): +/* Low byte of the match mask is empty: bit 0 of %ah set means lane 2, otherwise lane 3. */ + test $0x01, %ah + jnz L(exit3) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(exit0): +/* Lane 0. */ + mov %ecx, %eax + ret + + .p2align 4 +L(exit3): +/* Despite the label name, this returns lane 2 (offset 8). */ + lea 8(%ecx), %eax + ret + + .p2align 4 +L(return_null): +/* Terminator reached before any match: return 0. */ + xor %eax, %eax + ret + +END (__wcschr_sse2) +#endif |