diff options
author | Ulrich Drepper <drepper@gmail.com> | 2011-12-17 14:39:23 -0500 |
---|---|---|
committer | Ulrich Drepper <drepper@gmail.com> | 2011-12-17 14:39:23 -0500 |
commit | 1d3e4b618ae0217f1736753f3085f9c4fcc827bf (patch) | |
tree | 90a3f8d19f941a684e1482b8813c534d82cfb19e /sysdeps | |
parent | a2d18b64edb486825fb5946eefc2131426ccfec9 (diff) | |
download | glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.gz glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.xz glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.zip |
Optimized wcschr and wcscpy for x86-64 and x86-32
Diffstat (limited to 'sysdeps')
-rw-r--r-- | sysdeps/i386/i686/multiarch/Makefile | 3 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcschr-c.c | 8 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcschr-sse2.S | 220 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcschr.S | 54 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcscpy-c.c | 5 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcscpy-ssse3.S | 621 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcscpy.S | 46 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcsrchr-c.c | 5 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcsrchr-sse2.S | 355 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/wcsrchr.S | 54 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 6 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wcscpy-c.c | 5 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wcscpy-ssse3.S | 566 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wcscpy.S | 43 | ||||
-rw-r--r-- | sysdeps/x86_64/wcschr.S | 155 | ||||
-rw-r--r-- | sysdeps/x86_64/wcsrchr.S | 283 |
16 files changed, 2427 insertions, 2 deletions
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index 426b718e47..b764e5b825 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -37,7 +37,8 @@ endif ifeq ($(subdir),wcsmbs) sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \ - wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c + wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \ + wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c endif ifeq (mathyes,$(subdir)$(config-cflags-avx)) diff --git a/sysdeps/i386/i686/multiarch/wcschr-c.c b/sysdeps/i386/i686/multiarch/wcschr-c.c new file mode 100644 index 0000000000..a63e50e283 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcschr-c.c @@ -0,0 +1,8 @@ +#ifndef NOT_IN_libc +# undef libc_hidden_def +# define libc_hidden_def(name) \ + __hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32); +# define WCSCHR __wcschr_ia32 +#endif + +#include "wcsmbs/wcschr.c" diff --git a/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/sysdeps/i386/i686/multiarch/wcschr-sse2.S new file mode 100644 index 0000000000..cc8204cfe3 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcschr-sse2.S @@ -0,0 +1,220 @@ +/* wcschr with SSE2, without using bsf instructions + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4	/* Offset of the first argument from %esp at entry.  */
+# define STR1	PARMS	/* First argument: pointer to the wide string.  */
+# define STR2	STR1+4	/* Second argument: the wide character searched for.  */
+
+	atom_text_section
+ENTRY (__wcschr_sse2)
+/* In: STR1(%esp) = string, STR2(%esp) = character.  Out: %eax = address of the first matching wide character, or 0 when the terminating null comes first.  */
+	mov	STR1(%esp), %ecx	/* ecx = string pointer.  */
+	movd	STR2(%esp), %xmm1	/* Low dword of xmm1 = character.  */
+
+	mov	%ecx, %eax
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2	/* xmm2 = 0, compared against to find the null.  */
+	punpckldq %xmm1, %xmm1	/* The two unpacks leave the character in all four lanes.  */
+
+	and	$63, %eax	/* Offset of the pointer inside its 64-byte block.  */
+	cmp	$48, %eax
+	ja	L(cross_cache)	/* A 16-byte load from here would cross the 64-byte boundary.  */
+
+	movdqu	(%ecx), %xmm0	/* First four wide characters, unaligned load.  */
+	pcmpeqd	%xmm0, %xmm2	/* xmm2 lanes = all ones where the string holds a null.  */
+	pcmpeqd	%xmm1, %xmm0	/* xmm0 lanes = all ones where the character matches.  */
+	pmovmskb %xmm2, %edx	/* edx = null mask, four bits per wide character.  */
+	pmovmskb %xmm0, %eax	/* eax = match mask, four bits per wide character.  */
+	or	%eax, %edx
+	jnz	L(matches)
+	and	$-16, %ecx	/* Round down; L(loop) advances by 16 before loading.  */
+	jmp	L(loop)
+
+	.p2align 4
+L(cross_cache):
+	PUSH	(%edi)	/* edi is borrowed for the aligned base address.  */
+	mov	%ecx, %edi
+	mov	%eax, %ecx
+	and	$-16, %edi	/* edi = pointer rounded down to 16 bytes.  */
+	and	$15, %ecx	/* ecx = byte offset of the string inside that block.  */
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+
+	sarl	%cl, %edx	/* Drop mask bits of the bytes that precede the string.  */
+	sarl	%cl, %eax
+	test	%eax, %eax
+	jz	L(unaligned_no_match)
+
+	add	%edi, %ecx	/* ecx = original pointer again; mask bit 0 maps to it.  */
+	POP	(%edi)
+
+	test	%edx, %edx
+	jz	L(match_case1)	/* Match and no null in this block.  */
+	test	%al, %al
+	jz	L(match_higth_case2)
+	test	$15, %al
+	jnz	L(match_case2_4)
+	test	$15, %dl
+	jnz	L(return_null)
+	lea	4(%ecx), %eax
+	ret
+
+	CFI_PUSH (%edi)	/* Unwind info only: edi is still on the stack at the label below.  */
+
+	.p2align 4
+L(unaligned_no_match):
+	mov	%edi, %ecx	/* Continue from the aligned block base.  */
+	POP	(%edi)
+
+	test	%edx, %edx
+	jnz	L(return_null)	/* Null seen and no match: not found.  */
+
+	pxor	%xmm2, %xmm2
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	add	$16, %ecx	/* Four 16-byte steps are unrolled below; ecx = current block.  */
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2	/* xmm2 stays zero for as long as no null has been seen.  */
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jnz	L(matches)
+	add	$16, %ecx
+
+	movdqa	(%ecx), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %edx
+	pmovmskb %xmm0, %eax
+	or	%eax, %edx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	pmovmskb %xmm2, %edx	/* Reload the null mask; edx was merged with eax above.  */
+	test	%eax, %eax
+	jz	L(return_null)	/* Only a null in this block.  */
+	test	%edx, %edx
+	jz	L(match_case1)	/* Only a match in this block.  */
+
+	.p2align 4
+L(match_case2):	/* The block holds both a match and a null: the earlier one wins.  */
+	test	%al, %al
+	jz	L(match_higth_case2)	/* No match in wide characters 0 and 1.  */
+	test	$15, %al
+	jnz	L(match_case2_4)	/* Match in wide character 0.  */
+	test	$15, %dl
+	jnz	L(return_null)	/* Null in wide character 0 precedes the match in 1.  */
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case2_4):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(match_higth_case2):
+	test	%dl, %dl
+	jnz	L(return_null)	/* Null in wide character 0 or 1 precedes the match.  */
+	test	$15, %ah
+	jnz	L(match_case2_12)	/* Match in wide character 2.  */
+	test	$15, %dh
+	jnz	L(return_null)	/* Null in wide character 2 precedes the match in 3.  */
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_case2_12):
+	lea	8(%ecx), %eax	/* Returns offset 8 (wide character 2) despite the label name.  */
+	ret
+
+	.p2align 4
+L(match_case1):	/* Match without a null: return the lowest matching lane.  */
+	test	%al, %al
+	jz	L(match_higth_case1)
+
+	test	$0x01, %al
+	jnz	L(exit0)
+	lea	4(%ecx), %eax
+	ret
+
+	.p2align 4
+L(match_higth_case1):
+	test	$0x01, %ah
+	jnz	L(exit3)
+	lea	12(%ecx), %eax
+	ret
+
+	.p2align 4
+L(exit0):
+	mov	%ecx, %eax
+	ret
+
+	.p2align 4
+L(exit3):
+	lea	8(%ecx), %eax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	ret
+
+END (__wcschr_sse2)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcschr.S b/sysdeps/i386/i686/multiarch/wcschr.S
new file mode 100644
index 0000000000..bf0d6d5754
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcschr.S
@@ -0,0 +1,54 @@
+/* Multiple versions of wcschr
+   Copyright (C) 2011 Free Software Foundation, 
Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx	/* ebx = return address of the caller of this thunk.  */
+	ret
+
+	.text
+ENTRY(wcschr)
+	.type	wcschr, @gnu_indirect_function	/* Resolver: returns the chosen implementation in %eax.  */
+	pushl	%ebx	/* ebx is used as the GOT base below.  */
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx	/* ebx = address of the GOT.  */
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)	/* Zero presumably means __cpu_features is not initialized yet.  */
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__wcschr_ia32@GOTOFF(%ebx), %eax	/* Default: the generic C build.  */
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f	/* Keep the default when the SSE2 bit is clear.  */
+	leal	__wcschr_sse2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(wcschr)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcscpy-c.c b/sysdeps/i386/i686/multiarch/wcscpy-c.c
new file mode 100644
index 0000000000..a3c4024c01
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define wcscpy  __wcscpy_ia32
+#endif
+
+#include "wcsmbs/wcscpy.c"
diff --git 
a/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S new file mode 100644 index 0000000000..84d92a8bde --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S @@ -0,0 +1,621 @@ +/* wcscpy with SSSE3 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	4	/* Offset of the first argument from %esp at entry.  */
+# define RETURN	POP (%edi); ret; CFI_PUSH (%edi)
+# define STR1	PARMS	/* First argument: destination.  */
+# define STR2	STR1+4	/* Second argument: source.  */
+# define LEN	STR2+4
+
+/* In: STR1(%esp) = destination, STR2(%esp) = source.  Copies the source through its terminating null wide character; returns the destination in %eax.  */
+	atom_text_section
+ENTRY (__wcscpy_ssse3)
+	mov	STR1(%esp), %edx	/* edx = destination cursor.  */
+	mov	STR2(%esp), %ecx	/* ecx = source cursor.  */
+
+	cmpl	$0, (%ecx)	/* Explicit 32-bit compare: one wide character.  */
+	jz	L(ExitTail4)
+	cmpl	$0, 4(%ecx)
+	jz	L(ExitTail8)
+	cmpl	$0, 8(%ecx)
+	jz	L(ExitTail12)
+	cmpl	$0, 12(%ecx)
+	jz	L(ExitTail16)
+
+	PUSH	(%edi)
+	mov	%edx, %edi	/* edi keeps the original destination for the return value.  */
+
+	PUSH	(%esi)
+	lea	16(%ecx), %esi
+
+	and	$-16, %esi	/* esi = first 16-byte aligned source block after the start.  */
+
+	pxor	%xmm0, %xmm0	/* xmm0 = 0, compared against to find the null.  */
+	pcmpeqd	(%esi), %xmm0
+	movdqu	(%ecx), %xmm1	/* The first four wide characters are known to be non-null.  */
+	movdqu	%xmm1, (%edx)
+
+	pmovmskb %xmm0, %eax	/* eax = null mask, four bits per wide character.  */
+	sub	%ecx, %esi	/* esi = byte distance from the source start to that block.  */
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%edx, %eax
+	lea	16(%edx), %edx
+	and	$-16, %edx	/* Align the destination cursor to 16 bytes.  */
+	sub	%edx, %eax
+
+	sub	%eax, %ecx	/* Advance the source by the same distance.  */
+	mov	%ecx, %eax
+	and	$0xf, %eax	/* eax = source misalignment relative to the destination.  */
+	mov	$0, %esi	/* mov, not xor: the flags of the and above feed the jz below.  */
+
+	jz	L(Align16Both)
+	cmp	$4, %eax
+	je	L(Shl4)
+	cmp	$8, %eax
+	je	L(Shl8)
+	jmp	L(Shl12)
+
+L(Align16Both):
+	movaps	(%ecx), %xmm1
+	movaps	16(%ecx), %xmm2
+	movaps	%xmm1, (%edx)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm4
+	movaps	%xmm3, (%edx, %esi)
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm1
+	movaps	%xmm4, (%edx, %esi)
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm2
+	movaps	%xmm1, (%edx, %esi)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%ecx, %esi), %xmm3
+	movaps	%xmm2, (%edx, %esi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %eax
+	lea	16(%esi), %esi
+
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%edx, %esi)
+	mov	%ecx, %eax
+	lea	16(%ecx, %esi), %ecx
+	and	$-0x40, %ecx	/* Round the source down to a 64-byte boundary for the main loop.  */
+	sub	%ecx, %eax
+	sub	%eax, %edx	/* Move the destination by the same distance.  */
+
+	mov	$-0x40, %esi	/* Both cursors run 64 bytes ahead when the loop is left.  */
+
+L(Aligned64Loop):
+	movaps	(%ecx), %xmm2
+	movaps	32(%ecx), %xmm3
+	movaps	%xmm2, %xmm4
+	movaps	16(%ecx), %xmm5
+	movaps	%xmm3, %xmm6
+	movaps	48(%ecx), %xmm7
+	pminub	%xmm5, %xmm2	/* Bytewise minimum of the four blocks: may report a null that is not there.  */
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	lea	64(%edx), %edx
+	pcmpeqd	%xmm0, %xmm3
+	lea	64(%ecx), %ecx
+	pmovmskb %xmm3, %eax
+
+	test	%eax, %eax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%edx)
+	movaps	%xmm5, -48(%edx)
+	movaps	%xmm6, -32(%edx)
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):	/* Re-check each block on its own; resume the loop if none holds a null.  */
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm5, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm4, -64(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm6, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm5, -48(%edx)
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%edx)
+	pcmpeqd	%xmm7, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	lea	16(%esi), %esi
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	$-0x40, %esi
+	movaps	%xmm7, -16(%edx)
+	jmp	L(Aligned64Loop)
+
+	.p2align 4
+L(Shl4):	/* Source is 4 bytes past a 16-byte boundary relative to the destination.  */
+	movaps	-4(%ecx), %xmm1
+	movaps	12(%ecx), %xmm2
+L(Shl4Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2	/* xmm2 = 16 source bytes starting at ecx.  */
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	28(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	28(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx	/* Re-base so the loop below loads 64-byte aligned groups.  */
+	sub	%ecx, %eax
+	lea	-12(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-4(%ecx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%ecx), %xmm2
+	movaps	28(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)	/* Possible null: redo this group one block at a time.  */
+
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movaps	(%edx), %xmm6	/* NOTE(review): reads the destination block so its last 4 bytes can be written back unchanged; confirm this read-modify-write is intended.  */
+	psrldq	$12, %xmm6
+	palignr	$4, %xmm1, %xmm6	/* 12 pending source bytes followed by the kept destination bytes.  */
+	movaps	%xmm6, (%edx)
+	add	$12, %edx
+	add	$12, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)	/* Unwind info only: esi is still pushed at the label below.  */
+
+	.p2align 4
+L(Shl8):	/* Source is 8 bytes past a 16-byte boundary relative to the destination.  */
+	movaps	-8(%ecx), %xmm1
+	movaps	8(%ecx), %xmm2
+L(Shl8Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	24(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	24(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-8(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-8(%ecx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%ecx), %xmm2
+	movaps	24(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)
+
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movaps	(%edx), %xmm6	/* Same destination read-back as L(Shl4LoopExit), keeping 8 bytes.  */
+	psrldq	$8, %xmm6
+	palignr	$8, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+	add	$8, %edx
+	add	$8, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(Shl12):	/* Source is 12 bytes past a 16-byte boundary relative to the destination.  */
+	movaps	-12(%ecx), %xmm1
+	movaps	4(%ecx), %xmm2
+L(Shl12Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %eax
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	movaps	20(%ecx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%edx), %edx
+	pmovmskb %xmm0, %eax
+	lea	16(%ecx), %ecx
+	movaps	%xmm2, %xmm3
+
+	test	%eax, %eax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	20(%ecx), %ecx
+	lea	16(%edx), %edx
+
+	mov	%ecx, %eax
+	and	$-0x40, %ecx
+	sub	%ecx, %eax
+	lea	-4(%ecx), %ecx
+	sub	%eax, %edx
+
+	movaps	-12(%ecx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%ecx), %xmm2
+	movaps	20(%ecx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%ecx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%ecx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %eax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%eax, %eax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)
+
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%ecx), %ecx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%edx)
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movaps	(%edx), %xmm6	/* Same destination read-back as L(Shl4LoopExit), keeping 12 bytes.  */
+	psrldq	$4, %xmm6
+	mov	$4, %esi	/* The code below advances both cursors by esi.  */
+	palignr	$12, %xmm1, %xmm6
+	movaps	%xmm6, (%edx)
+
+	.p2align 4
+L(CopyFrom1To16Bytes):	/* eax = null mask of the final block, esi = its offset from both cursors.  */
+	add	%esi, %edx
+	add	%esi, %ecx
+
+	POP	(%esi)
+	test	%al, %al
+	jz	L(ExitHigh)	/* Null is in wide character 2 or 3.  */
+	test	$0x01, %al
+	jnz	L(Exit4)	/* Null is wide character 0.  */
+	movlpd	(%ecx), %xmm0	/* Null is wide character 1: copy 8 bytes.  */
+	movlpd	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit12)	/* Null is wide character 2.  */
+	movdqu	(%ecx), %xmm0	/* Null is wide character 3: copy all 16 bytes.  */
+	movdqu	%xmm0, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edi, %eax
+	RETURN
+
+	.p2align 4
+L(Exit12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edi, %eax
+	RETURN
+
+CFI_POP	(%edi)	/* Unwind info only: the ExitTail paths run before edi is pushed.  */
+
+	.p2align 4
+L(ExitTail4):
+	movl	(%ecx), %eax
+	movl	%eax, (%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail8):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail12):
+	movlpd	(%ecx), %xmm0
+	movlpd	%xmm0, (%edx)
+	movl	8(%ecx), %eax
+	movl	%eax, 8(%edx)
+	movl	%edx, %eax
+	ret
+
+	.p2align 4
+L(ExitTail16):
+	movdqu	(%ecx), %xmm0
+	movdqu	%xmm0, (%edx)
+	movl	%edx, %eax
+	ret
+
+END (__wcscpy_ssse3)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcscpy.S b/sysdeps/i386/i686/multiarch/wcscpy.S
new file mode 100644
index 0000000000..c7bafbe82a
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcscpy.S
@@ -0,0 +1,46 @@
+/* Multiple versions of wcscpy
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA. 
*/
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+	.text
+ENTRY(wcscpy)
+	.type	wcscpy, @gnu_indirect_function	/* Resolver: returns the chosen implementation in %eax.  */
+	pushl	%ebx	/* ebx is used as the GOT base below.  */
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx	/* NOTE(review): the thunk is only referenced in this file, so its definition must come from another object; confirm.  */
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx	/* Presumably ebx = address of the GOT, given that the thunk returns the PC.  */
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)	/* Zero presumably means __cpu_features is not initialized yet.  */
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__wcscpy_ia32@GOTOFF(%ebx), %eax	/* Default: the generic C build.  */
+	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	jz	2f	/* Keep the default when the SSSE3 bit is clear.  */
+	leal	__wcscpy_ssse3@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(wcscpy)
+#endif
diff --git a/sysdeps/i386/i686/multiarch/wcsrchr-c.c b/sysdeps/i386/i686/multiarch/wcsrchr-c.c
new file mode 100644
index 0000000000..c7444ce89b
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcsrchr-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define wcsrchr  __wcsrchr_ia32
+#endif
+
+#include "wcsmbs/wcsrchr.c"
diff --git a/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
new file mode 100644
index 0000000000..2859f7e9f3
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S
@@ -0,0 +1,355 @@
+/* wcsrchr with SSE2, without using bsf instructions.
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details. 
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+# define CFI_PUSH(REG)	\
+	cfi_adjust_cfa_offset (4);	\
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG)	\
+	cfi_adjust_cfa_offset (-4);	\
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	8	/* Arguments start at 8(%esp) once ENTRANCE has pushed %edi.  */
+# define ENTRANCE	PUSH (%edi);
+# define RETURN	POP (%edi); ret; CFI_PUSH (%edi);
+# define STR1	PARMS	/* First argument: pointer to the wide string.  */
+# define STR2	STR1+4	/* Second argument: the wide character searched for.  */
+
+	atom_text_section
+ENTRY (__wcsrchr_sse2)
+/* In: STR1(%esp) = string, STR2(%esp) = character.  Out: %eax = address of the last matching wide character before or at the terminating null, or 0.  */
+	ENTRANCE
+	mov	STR1(%esp), %ecx
+	movd	STR2(%esp), %xmm1
+
+	mov	%ecx, %edi	/* edi = scan cursor.  */
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2	/* xmm2 = 0, compared against to find the null.  */
+	punpckldq %xmm1, %xmm1	/* The two unpacks leave the character in all four lanes.  */
+
+/* ECX has OFFSET. */
+	and	$63, %ecx
+	cmp	$48, %ecx
+	ja	L(crosscache)	/* A 16-byte load from here would cross the 64-byte boundary.  */
+
+/* unaligned string. */
+	movdqu	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is. */
+	pmovmskb %xmm2, %ecx
+/* Check if there is a match. */
+	pmovmskb %xmm0, %eax
+	add	$16, %edi	/* From here on edi points 16 bytes past the block just examined.  */
+
+	test	%eax, %eax
+	jnz	L(unaligned_match1)
+
+	test	%ecx, %ecx
+	jnz	L(return_null)
+
+	and	$-16, %edi
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx	/* edx = 0: no match remembered yet.  */
+	jmp	L(loop)
+
+	CFI_POP	(%esi)	/* Unwind info only: esi is not pushed at the label below.  */
+
+	.p2align 4
+L(unaligned_match1):
+	test	%ecx, %ecx
+	jnz	L(prolog_find_zero_1)
+
+	PUSH	(%esi)
+
+/* Save current match */
+	mov	%eax, %edx	/* edx = match mask of the remembered block.  */
+	mov	%edi, %esi	/* esi = address 16 bytes past the remembered block.  */
+	and	$-16, %edi
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(crosscache):
+/* Handle unaligned string. */
+	and	$15, %ecx	/* ecx = byte offset of the string inside its 16-byte block.  */
+	and	$-16, %edi
+	pxor	%xmm3, %xmm3
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm3
+	pcmpeqd	%xmm1, %xmm0
+/* Find where NULL is. */
+	pmovmskb %xmm3, %edx
+/* Check if there is a match. */
+	pmovmskb %xmm0, %eax
+/* Remove the leading bytes. */
+	shr	%cl, %edx
+	shr	%cl, %eax
+	add	$16, %edi
+
+	test	%eax, %eax
+	jnz	L(unaligned_match)
+
+	test	%edx, %edx
+	jnz	L(return_null)
+
+	PUSH	(%esi)
+
+	xor	%edx, %edx
+	jmp	L(loop)
+
+	CFI_POP	(%esi)
+
+	.p2align 4
+L(unaligned_match):
+	test	%edx, %edx
+	jnz	L(prolog_find_zero)
+
+	PUSH	(%esi)
+
+	mov	%eax, %edx
+	lea	(%edi, %ecx), %esi	/* The shifted mask is relative to the string start, so add the offset back.  */
+
+/* Loop start on aligned string. */
+	.p2align 4
+L(loop):
+	movdqa	(%edi), %xmm0
+	pcmpeqd	%xmm0, %xmm2	/* xmm2 stays zero for as long as no null has been seen.  */
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm0, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm3
+	pcmpeqd	%xmm3, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm4
+	pcmpeqd	%xmm4, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm4
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm4, %eax
+	or	%eax, %ecx
+	jnz	L(matches)
+
+	movdqa	(%edi), %xmm5
+	pcmpeqd	%xmm5, %xmm2
+	add	$16, %edi
+	pcmpeqd	%xmm1, %xmm5
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm5, %eax
+	or	%eax, %ecx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	test	%eax, %eax
+	jnz	L(match)
+L(return_value):	/* End of string with no usable match in the last block: use the remembered one.  */
+	test	%edx, %edx
+	jz	L(return_null_1)
+	mov	%edx, %eax
+	mov	%esi, %edi
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)	/* Unwind info only: esi is still pushed at the label below.  */
+
+	.p2align 4
+L(return_null_1):
+	POP	(%esi)
+
+	xor	%eax, %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %ecx	/* Reload the null mask; ecx was merged with eax in the loop.  */
+	test	%ecx, %ecx
+	jnz	L(find_zero)
+/* save match info */
+	mov	%eax, %edx
+	mov	%edi, %esi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):	/* The block holds a match and the null: keep only matches up to the null.  */
+	test	%cl, %cl
+	jz	L(find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(find_zero_in_second_wchar)
+	and	$1, %eax	/* Null is wide character 0: only a match on it counts.  */
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax	/* Mask 0x1f: as parses this as (1 << 5) - 1.  */
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax	/* Mask 0x1ff: as parses this as (1 << 9) - 1.  */
+	jz	L(return_value)
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	CFI_PUSH	(%esi)
+
+	.p2align 4
+L(find_zero_in_fourth_wchar):
+
+	POP	(%esi)
+
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+/* No CFI_PUSH (%esi) here: every label below is entered with %esi already popped or never pushed, so only %edi is on the stack.  */
+
+	.p2align 4
+L(match_second_wchar):
+	lea	-12(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_third_or_fourth_wchar):
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_third_wchar):
+	lea	-8(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(match_fourth_wchar):
+	lea	-4(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(return_null):
+	xor	%eax, %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero):	/* Match and null both in the very first block: nothing remembered, so no match means 0.  */
+	add	%ecx, %edi
+	mov	%edx, %ecx
+L(prolog_find_zero_1):
+	test	%cl, %cl
+	jz	L(prolog_find_zero_in_third_or_fourth_wchar)
+	test	$15, %cl
+	jz	L(prolog_find_zero_in_second_wchar)
+	and	$1, %eax
+	jz	L(return_null)
+
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %eax
+	jz	L(return_null)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_third_or_fourth_wchar):
+	test	$15, %ch
+	jz	L(prolog_find_zero_in_fourth_wchar)
+	and	$1 << 9 - 1, %eax
+	jz	L(return_null)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+	.p2align 4
+L(prolog_find_zero_in_fourth_wchar):
+	test	%ah, %ah
+	jnz	L(match_third_or_fourth_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%edi), %eax
+	RETURN
+
+END (__wcsrchr_sse2)
+#endif
diff --git 
a/sysdeps/i386/i686/multiarch/wcsrchr.S b/sysdeps/i386/i686/multiarch/wcsrchr.S new file mode 100644 index 0000000000..8240063dd6 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/wcsrchr.S @@ -0,0 +1,54 @@ +/* Multiple versions of wcsrchr + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef NOT_IN_libc
+	.section	.gnu.linkonce.t.__i686.get_pc_thunk.bx,"ax",@progbits
+	.globl	__i686.get_pc_thunk.bx
+	.hidden	__i686.get_pc_thunk.bx
+	.p2align 4
+	.type	__i686.get_pc_thunk.bx,@function
+__i686.get_pc_thunk.bx:
+	movl	(%esp), %ebx	/* ebx = return address of the caller of this thunk.  */
+	ret
+
+	.text
+ENTRY(wcsrchr)
+	.type	wcsrchr, @gnu_indirect_function	/* Resolver: returns the chosen implementation in %eax.  */
+	pushl	%ebx	/* ebx is used as the GOT base below.  */
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (ebx, 0)
+	call	__i686.get_pc_thunk.bx
+	addl	$_GLOBAL_OFFSET_TABLE_, %ebx	/* ebx = address of the GOT.  */
+	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)	/* Zero presumably means __cpu_features is not initialized yet.  */
+	jne	1f
+	call	__init_cpu_features
+1:	leal	__wcsrchr_ia32@GOTOFF(%ebx), %eax	/* Default: the generic C build.  */
+	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	jz	2f	/* Keep the default when the SSE2 bit is clear.  */
+	leal	__wcsrchr_sse2@GOTOFF(%ebx), %eax
+2:	popl	%ebx
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (ebx)
+	ret
+END(wcsrchr)
+#endif
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4cf4cf4b28..9a183f068e 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
 		   strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
-		   memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
+		   memcmp-ssse3
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
@@ -28,3 +28,7 @@ CFLAGS-strcasestr.c += -msse4
 CFLAGS-strcasestr-nonascii.c += -msse4
 endif
 endif
+
+ifeq ($(subdir),wcsmbs)
+sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
+endif
diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c
new file mode 100644
index 0000000000..f27c069198
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define wcscpy  __wcscpy_sse2
+#endif
+
+#include 
"wcsmbs/wcscpy.c" diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S new file mode 100644 index 0000000000..133700fbe4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S @@ -0,0 +1,566 @@ +/* wcscpy with SSSE3 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+/* __wcscpy_ssse3 (dest = %rdi, src = %rsi): copies 4-byte elements up to and including the first zero element; every exit returns %rdi in %rax.  */
+.text
+ENTRY (__wcscpy_ssse3)
+	mov	%rsi, %rcx	/* rcx = source cursor */
+	mov	%rdi, %rdx	/* rdx = destination cursor; rdi is left untouched for the return value */
+/* Terminator within the first four elements: copy 4, 8, 12 or 16 bytes and return.  */
+	cmpl	$0, (%rcx)
+	jz	L(Exit4)
+	cmpl	$0, 4(%rcx)
+	jz	L(Exit8)
+	cmpl	$0, 8(%rcx)
+	jz	L(Exit12)
+	cmpl	$0, 12(%rcx)
+	jz	L(Exit16)
+
+	lea	16(%rcx), %rsi
+	and	$-16, %rsi	/* rsi = first 16-byte boundary above src */
+
+	pxor	%xmm0, %xmm0	/* xmm0 = 0; a pcmpeqd into it that finds no zero element leaves it 0 */
+	mov	(%rcx), %r9
+	mov	%r9, (%rdx)	/* the first 16 bytes hold no terminator: copy them with two 8-byte moves */
+
+	pcmpeqd	(%rsi), %xmm0	/* test the aligned block at rsi for a zero element */
+	mov	8(%rcx), %r9
+	mov	%r9, 8(%rdx)
+
+	pmovmskb %xmm0, %rax	/* rax = byte mask of zero elements in that block */
+	sub	%rcx, %rsi	/* rsi = byte offset of that block from src */
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+/* Round dest up to the next 16-byte boundary and advance src by the same distance.  */
+	mov	%rdx, %rax
+	lea	16(%rdx), %rdx
+	and	$-16, %rdx
+	sub	%rdx, %rax
+	sub	%rax, %rcx
+	mov	%rcx, %rax
+	and	$0xf, %rax	/* rax = src misalignment within 16 bytes; the ZF set here feeds the jz below */
+	mov	$0, %rsi	/* mov (not xor) so the flags of the and stay intact */
+
+/* case: rcx_offset == rdx_offset */
+
+	jz	L(Align16Both)
+
+	cmp	$4, %rax
+	je	L(Shl4)
+	cmp	$8, %rax
+	je	L(Shl8)
+	jmp	L(Shl12)
+
+L(Align16Both):
+	movaps	(%rcx), %xmm1
+	movaps	16(%rcx), %xmm2
+	movaps	%xmm1, (%rdx)
+	pcmpeqd	%xmm2, %xmm0	/* each step: store the previous block, test the next one */
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi	/* rsi = running byte offset applied to both cursors */
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm4
+	movaps	%xmm3, (%rdx, %rsi)
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm1
+	movaps	%xmm4, (%rdx, %rsi)
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm2
+	movaps	%xmm1, (%rdx, %rsi)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%rdx, %rsi)
+	mov	%rcx, %rax
+	lea	16(%rcx, %rsi), %rcx
+	and	$-0x40, %rcx	/* src = 64-byte boundary at or below the next block to read */
+	sub	%rcx, %rax
+	sub	%rax, %rdx	/* move dest by the same displacement as src */
+
+	mov	$-0x40, %rsi	/* offset used when L(Aligned64Leave) exits through L(CopyFrom1To16Bytes) */
+
+L(Aligned64Loop):
+	movaps	(%rcx), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%rcx), %xmm5
+	movaps	32(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%rcx), %xmm7
+	pminub	%xmm5, %xmm2	/* fold the four blocks with a byte-wise minimum ... */
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqd	%xmm0, %xmm3	/* ... so that one compare flags a possible zero element in any of them */
+	pmovmskb %xmm3, %rax
+	lea	64(%rdx), %rdx
+	lea	64(%rcx), %rcx
+	test	%rax, %rax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%rdx)
+	movaps	%xmm5, -48(%rdx)
+	movaps	%xmm6, -32(%rdx)
+	movaps	%xmm7, -16(%rdx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+	pcmpeqd	%xmm4, %xmm0	/* re-test the four saved blocks one at a time; rsi tracks the block offset */
+	pmovmskb %xmm0, %rax
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm5, %xmm0
+
+	pmovmskb %xmm0, %rax
+	movaps	%xmm4, -64(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi	/* lea does not disturb the flags of the test above */
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm6, %xmm0
+
+	pmovmskb %xmm0, %rax
+	movaps	%xmm5, -48(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%rdx)
+	pcmpeqd	%xmm7, %xmm0
+
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	$-0x40, %rsi	/* no block had a zero element (the byte-wise fold can over-report): reset the offset and resume */
+	movaps	%xmm7, -16(%rdx)
+	jmp	L(Aligned64Loop)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%rcx), %xmm1	/* src is 4 bytes past a 16-byte boundary: xmm1 = aligned block containing rcx */
+	movaps	12(%rcx), %xmm2	/* xmm2 = the following aligned block */
+L(Shl4Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3	/* unshifted copy; it becomes xmm1 for the next step */
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2	/* xmm2 = bytes 4..15 of xmm1 followed by bytes 0..3 of xmm2, i.e. 16 bytes starting at rcx */
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	28(%rcx), %rcx	/* rcx = address of the next aligned block to read */
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* back up to a 64-byte boundary; dest is moved by the same amount below */
+	sub	%rcx, %rax
+	lea	-12(%rcx), %rcx	/* restore the 4-byte skew so that 12(%rcx) is the aligned block */
+	sub	%rax, %rdx
+
+	movaps	-4(%rcx), %xmm1	/* reload the block preceding the loop start */
+
+L(Shl4LoopStart):
+	movaps	12(%rcx), %xmm2
+	movaps	28(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)	/* possible terminator: redo these blocks one at a time (xmm1 and xmm2 are still unmodified) */
+
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1	/* last block read becomes the carry-in of the next iteration */
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movaps	(%rdx), %xmm6	/* read dest back so that its last 4 bytes survive the 16-byte store below */
+	psrldq	$12, %xmm6
+	mov	$12, %rsi	/* 12 bytes are flushed here; the tail copy starts 12 bytes further on */
+	palignr	$4, %xmm1, %xmm6	/* bytes 4..15 of xmm1 followed by the 4 preserved dest bytes */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl8):
+	movaps	-8(%rcx), %xmm1	/* src is 8 bytes past a 16-byte boundary; same scheme as L(Shl4) with palignr $8 */
+	movaps	8(%rcx), %xmm2
+L(Shl8Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	24(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-8(%rcx), %rcx	/* restore the 8-byte skew so that 8(%rcx) is the aligned block */
+	sub	%rax, %rdx
+
+	movaps	-8(%rcx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%rcx), %xmm2
+	movaps	24(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)
+
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movaps	(%rdx), %xmm6
+	psrldq	$8, %xmm6
+	mov	$8, %rsi	/* 8 bytes of xmm1 are flushed here; the tail copy starts 8 bytes further on */
+	palignr	$8, %xmm1, %xmm6
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl12):
+	movaps	-12(%rcx), %xmm1	/* src is 12 bytes past a 16-byte boundary; same scheme as L(Shl4) with palignr $12 */
+	movaps	4(%rcx), %xmm2
+L(Shl12Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	20(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-4(%rcx), %rcx	/* restore the 12-byte skew so that 4(%rcx) is the aligned block */
+	sub	%rax, %rdx
+
+	movaps	-12(%rcx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%rcx), %xmm2
+	movaps	20(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movaps	(%rdx), %xmm6
+	psrldq	$4, %xmm6
+	mov	$4, %rsi	/* 4 bytes of xmm1 are flushed here; the tail copy starts 4 bytes further on */
+	palignr	$12, %xmm1, %xmm6
+	movaps	%xmm6, (%rdx)
+/* No jump here: execution falls through into L(CopyFrom1To16Bytes).  */
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%rsi, %rdx	/* point both cursors at the block in which the terminator was found */
+	add	%rsi, %rcx
+
+	test	%al, %al	/* al = mask bits of elements 0-1, ah = mask bits of elements 2-3 */
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)	/* element 0 is the terminator */
+/* Element 1 is the terminator: copy 8 bytes.  */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit12)	/* element 2 is the terminator */
+/* Element 3 is the terminator: copy all 16 bytes.  */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %rax
+	mov	%rax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit4):
+	movl	(%rcx), %eax	/* copy 4 bytes, return dest */
+	movl	%eax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit8):
+	mov	(%rcx), %rax	/* copy 8 bytes, return dest */
+	mov	%rax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit12):
+	mov	(%rcx), %rax	/* copy 8 + 4 bytes, return dest */
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %eax
+	mov	%eax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%rcx), %xmm0	/* copy 16 bytes with unaligned SSE moves, return dest */
+	movdqu	%xmm0, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+END(__wcscpy_ssse3)
+#endif
+
diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/wcscpy.S
new file mode 100644
index 0000000000..818c5549e6
--- /dev/null
+++ 
b/sysdeps/x86_64/multiarch/wcscpy.S
@@ -0,0 +1,43 @@
+/* Multiple versions of wcscpy
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+/* IFUNC selector: returns in %rax the address of __wcscpy_ssse3 when the SSSE3 bit is set in __cpu_features, otherwise __wcscpy_sse2.  */
+	.text
+ENTRY(wcscpy)
+	.type	wcscpy, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)	/* presumably 0 means __cpu_features is not initialised yet -- confirm in init-arch.h */
+	jne	1f
+	call	__init_cpu_features	/* NOTE(review): nothing visible here re-aligns %rsp before this call -- confirm ENTRY or the callee makes that safe */
+
+1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jnz	2f
+	leaq	__wcscpy_sse2(%rip), %rax	/* SSSE3 bit clear */
+	ret
+
+2:	leaq	__wcscpy_ssse3(%rip), %rax	/* SSSE3 bit set */
+	ret
+
+END(wcscpy)
+#endif
diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S
new file mode 100644
index 0000000000..b3a1b3b713
--- /dev/null
+++ b/sysdeps/x86_64/wcschr.S
@@ -0,0 +1,155 @@
+/* wcschr with SSE2 (no SSSE3 instructions are used)
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+/* wcschr (s = %rdi, c = low 32 bits of %rsi): returns in %rax the address of the first 4-byte element of s equal to c, or 0 when the terminating zero element comes before any match.  */
+	.text
+ENTRY (wcschr)
+
+	movd	%rsi, %xmm1	/* xmm1 = c; the two punpckldq below broadcast it to all four dwords */
+	pxor	%xmm2, %xmm2	/* xmm2 = 0, the terminator pattern */
+	mov	%rdi, %rcx
+	punpckldq %xmm1, %xmm1
+	punpckldq %xmm1, %xmm1
+
+	and	$63, %rcx
+	cmp	$48, %rcx
+	ja	L(cross_cache)	/* offset within a 64-byte line > 48: a 16-byte load at s would cross the line */
+
+	movdqu	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2	/* xmm2 = mask of zero elements */
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0	/* xmm0 = mask of elements equal to c */
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)	/* a match or the terminator lies in this block */
+
+	and	$-16, %rdi	/* continue from the next 16-byte boundary; part of the first block may be scanned again */
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2	/* xmm2 is all zero again here because the previous compare found nothing */
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	jmp	L(loop)
+
+L(cross_cache):
+	and	$15, %rcx	/* rcx = misalignment of s within 16 bytes */
+	and	$-16, %rdi	/* rdi = aligned block containing s */
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+
+	sar	%cl, %rdx	/* drop the mask bits that belong to bytes before s */
+	sar	%cl, %rax
+	test	%rax, %rax
+	je	L(unaligned_no_match)
+
+	bsf	%rax, %rax	/* rax = byte offset of the first match, relative to s */
+	test	%rdx, %rdx
+	je	L(unaligned_match)
+	bsf	%rdx, %rdx	/* rdx = byte offset of the terminator, relative to s */
+	cmp	%rdx, %rax
+	ja	L(return_null)	/* the terminator precedes the match */
+
+L(unaligned_match):
+	add	%rdi, %rax	/* aligned base + misalignment + offset = address of the match */
+	add	%rcx, %rax
+	ret
+
+	.p2align 4
+L(unaligned_no_match):
+	test	%rdx, %rdx
+	jne	L(return_null)	/* terminator found, no match */
+	pxor	%xmm2, %xmm2	/* reset the zero pattern: xmm2 may still flag zero elements located before s */
+
+	add	$16, %rdi
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop):
+	movdqa	(%rdi), %xmm0	/* four identical steps per iteration, 16 bytes each */
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+	jmp	L(loop)
+
+	.p2align 4
+L(matches):
+	pmovmskb %xmm2, %rdx	/* reload the zero mask; the or above merged rax into rdx */
+	test	%rax, %rax
+	jz	L(return_null)	/* only the terminator was found */
+	bsf	%rax, %rax	/* rax = byte offset of the first match within the block */
+	test	%rdx, %rdx
+	je	L(match)
+	bsf	%rdx, %rcx	/* rcx = byte offset of the terminator within the block */
+	cmp	%rcx, %rax
+	ja	L(return_null)	/* the terminator precedes the match */
+L(match):
+	sub	$16, %rdi	/* rdi was already advanced past this block */
+	add	%rdi, %rax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%rax, %rax
+	ret
+
+END (wcschr)
+
+libc_hidden_def(wcschr)
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
new file mode 100644
index 0000000000..c2e4b7e97a
--- /dev/null
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -0,0 +1,283 @@
+/* wcsrchr with SSE2 (no SSSE3 instructions are used)
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+/* wcsrchr (s = %rdi, c = low 32 bits of %rsi): returns in %rax the address of the last 4-byte element of s equal to c, or 0.  %r8 keeps the match mask of the latest 16-byte window that had a match and %rsi the address just past that window.  */
+	.text
+ENTRY (wcsrchr)
+
+	movd	%rsi, %xmm1	/* xmm1 = c; the two punpckldq below broadcast it to all four dwords */
+	mov	%rdi, %rcx
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2	/* xmm2 = 0, the terminator pattern; it stays 0 while no zero element is found */
+	punpckldq %xmm1, %xmm1
+	and	$63, %rcx
+	cmp	$48, %rcx
+	ja	L(crosscache)	/* offset within a 64-byte line > 48: a 16-byte load at s would cross the line */
+
+	movdqu	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rcx	/* rcx = zero mask, rax = match mask */
+	pmovmskb %xmm0, %rax
+	add	$16, %rdi
+
+	test	%rax, %rax
+	jnz	L(unaligned_match1)
+
+	test	%rcx, %rcx
+	jnz	L(return_null)	/* terminator and no match in the first 16 bytes */
+
+	and	$-16, %rdi	/* continue from the next 16-byte boundary */
+	xor	%r8, %r8	/* no match recorded yet */
+	jmp	L(loop)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%rcx, %rcx
+	jnz	L(prolog_find_zero_1)	/* match and terminator are both in the first 16 bytes */
+
+	mov	%rax, %r8	/* record the match: mask in r8, rsi = s + 16 */
+	mov	%rdi, %rsi
+	and	$-16, %rdi
+	jmp	L(loop)
+
+	.p2align 4
+L(crosscache):
+	and	$15, %rcx	/* rcx = misalignment of s within 16 bytes */
+	and	$-16, %rdi	/* rdi = aligned block containing s */
+	pxor	%xmm3, %xmm3	/* separate zero pattern, so xmm2 is still all zero when the loop starts */
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm3
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm3, %rdx
+	pmovmskb %xmm0, %rax
+	shr	%cl, %rdx	/* drop the mask bits that belong to bytes before s */
+	shr	%cl, %rax
+	add	$16, %rdi
+
+	test	%rax, %rax
+	jnz	L(unaligned_match)
+
+	test	%rdx, %rdx
+	jnz	L(return_null)
+
+	xor	%r8, %r8
+	jmp	L(loop)
+
+	.p2align 4
+L(unaligned_match):
+	test	%rdx, %rdx
+	jnz	L(prolog_find_zero)
+
+	mov	%rax, %r8
+	lea	(%rdi, %rcx), %rsi	/* rsi = s + 16, consistent with the shifted mask saved in r8 */
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%rdi), %xmm0	/* four steps per iteration, 16 bytes each; each uses its own data register */
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm3
+	pcmpeqd	%xmm3, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm3, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm4
+	pcmpeqd	%xmm4, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm4
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm4, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm5
+	pcmpeqd	%xmm5, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm5
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm5, %rax
+	or	%rax, %rcx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	test	%rax, %rax
+	jnz	L(match)	/* this block contains at least one match */
+L(return_value):
+	test	%r8, %r8	/* terminator reached without a usable match in this block: report the recorded one, if any */
+	jz	L(return_null)
+	mov	%r8, %rax
+	mov	%rsi, %rdi
+
+	test	$15 << 4, %ah	/* mask bits 12-15: element 3 matched */
+	jnz	L(match_fourth_wchar)
+	test	%ah, %ah	/* mask bits 8-11: element 2 matched */
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al	/* mask bits 4-7: element 1 matched */
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax	/* otherwise element 0 */
+	ret
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %rcx	/* reload the zero mask; the or in the loop merged rax into rcx */
+	test	%rcx, %rcx
+	jnz	L(find_zero)
+	mov	%rax, %r8	/* no terminator yet: record this block as the latest match and keep scanning */
+	mov	%rdi, %rsi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):
+	test	$15, %cl	/* locate the element that holds the terminator */
+	jnz	L(find_zero_in_first_wchar)
+	test	%cl, %cl
+	jnz	L(find_zero_in_second_wchar)
+	test	$15, %ch
+	jnz	L(find_zero_in_third_wchar)
+
+	and	$1 << 13 - 1, %rax	/* gas gives << higher precedence than -, so the mask is (1 << 13) - 1 = 0x1fff: keep match bits 0-12 */
+	jz	L(return_value)	/* no match at or before the terminator: fall back to the recorded one */
+
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(find_zero_in_first_wchar):
+	test	$1, %rax	/* element 0 is the terminator; it counts as a match only if it also equals c */
+	jz	L(return_value)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %rax	/* (1 << 5) - 1 = 0x1f: keep match bits 0-4 */
+	jz	L(return_value)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(find_zero_in_third_wchar):
+	and	$1 << 9 - 1, %rax	/* (1 << 9) - 1 = 0x1ff: keep match bits 0-8 */
+	jz	L(return_value)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero):
+	add	%rcx, %rdi	/* rdi = s + 16, so the decoding below is relative to s */
+	mov	%rdx, %rcx	/* rcx = zero mask, as L(prolog_find_zero_1) expects */
+L(prolog_find_zero_1):
+	test	$15, %cl	/* same decoding as L(find_zero), but nothing is recorded yet, so no match means 0 */
+	jnz	L(prolog_find_zero_in_first_wchar)
+	test	%cl, %cl
+	jnz	L(prolog_find_zero_in_second_wchar)
+	test	$15, %ch
+	jnz	L(prolog_find_zero_in_third_wchar)
+
+	and	$1 << 13 - 1, %rax	/* (1 << 13) - 1 = 0x1fff */
+	jz	L(return_null)
+
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_in_first_wchar):
+	test	$1, %rax
+	jz	L(return_null)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %rax	/* (1 << 5) - 1 = 0x1f */
+	jz	L(return_null)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_in_third_wchar):
+	and	$1 << 9 - 1, %rax	/* (1 << 9) - 1 = 0x1ff */
+	jz	L(return_null)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match_second_wchar):
+	lea	-12(%rdi), %rax	/* rdi points just past the 16-byte window: element 1 is at -12 */
+	ret
+
+	.p2align 4
+L(match_third_wchar):
+	lea	-8(%rdi), %rax	/* element 2 */
+	ret
+
+	.p2align 4
+L(match_fourth_wchar):
+	lea	-4(%rdi), %rax	/* element 3 */
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%rax, %rax
+	ret
+
+END (wcsrchr) |