diff options
author | Ulrich Drepper <drepper@gmail.com> | 2011-12-17 14:39:23 -0500 |
---|---|---|
committer | Ulrich Drepper <drepper@gmail.com> | 2011-12-17 14:39:23 -0500 |
commit | 1d3e4b618ae0217f1736753f3085f9c4fcc827bf (patch) | |
tree | 90a3f8d19f941a684e1482b8813c534d82cfb19e /sysdeps/x86_64 | |
parent | a2d18b64edb486825fb5946eefc2131426ccfec9 (diff) | |
download | glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.gz glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.xz glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.zip |
Optimized wcschr and wcscpy for x86-64 and x86-32
Diffstat (limited to 'sysdeps/x86_64')
-rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 6 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wcscpy-c.c | 5 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wcscpy-ssse3.S | 566 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wcscpy.S | 43 | ||||
-rw-r--r-- | sysdeps/x86_64/wcschr.S | 155 | ||||
-rw-r--r-- | sysdeps/x86_64/wcsrchr.S | 283 |
6 files changed, 1057 insertions, 1 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4cf4cf4b28..9a183f068e 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
 		   strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
-		   memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
+		   memcmp-ssse3
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
@@ -28,3 +28,7 @@ CFLAGS-strcasestr.c += -msse4
 CFLAGS-strcasestr-nonascii.c += -msse4
 endif
 endif
+
+ifeq ($(subdir),wcsmbs)
+sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
+endif
diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c
new file mode 100644
index 0000000000..f27c069198
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define wcscpy  __wcscpy_sse2
+#endif
+
+#include "wcsmbs/wcscpy.c"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
new file mode 100644
index 0000000000..133700fbe4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -0,0 +1,566 @@
+/* wcscpy with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+
+.text
+ENTRY (__wcscpy_ssse3)	/* in: %rdi = dst, %rsi = src; every exit returns the original %rdi in %rax */
+	mov	%rsi, %rcx	/* rcx = source cursor */
+	mov	%rdi, %rdx	/* rdx = destination cursor; rdi is kept for the return value */
+
+	cmpl	$0, (%rcx)	/* terminator among the first four wchars: copy just that much and return */
+	jz	L(Exit4)
+	cmpl	$0, 4(%rcx)
+	jz	L(Exit8)
+	cmpl	$0, 8(%rcx)
+	jz	L(Exit12)
+	cmpl	$0, 12(%rcx)
+	jz	L(Exit16)
+
+	lea	16(%rcx), %rsi
+	and	$-16, %rsi	/* rsi = first 16-byte boundary above src */
+
+	pxor	%xmm0, %xmm0	/* xmm0 = 0: comparand for the terminator search */
+	mov	(%rcx), %r9	/* copy the first 16 bytes; the checks above found no terminator in them */
+	mov	%r9, (%rdx)
+
+	pcmpeqd	(%rsi), %xmm0	/* scan the aligned block at rsi */
+	mov	8(%rcx), %r9
+	mov	%r9, 8(%rdx)
+
+	pmovmskb %xmm0, %rax
+	sub	%rcx, %rsi	/* rsi = byte offset of that block from src */
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%rdx, %rax
+	lea	16(%rdx), %rdx
+	and	$-16, %rdx	/* move dst up to its next 16-byte boundary ... */
+	sub	%rdx, %rax
+	sub	%rax, %rcx	/* ... and advance src by the same distance */
+	mov	%rcx, %rax
+	and	$0xf, %rax	/* rax = src misalignment now that dst is aligned */
+	mov	$0, %rsi	/* mov leaves the flags of the and above intact for the jz below */
+
+/* case: rcx_offset == rdx_offset */
+
+	jz	L(Align16Both)
+
+	cmp	$4, %rax
+	je	L(Shl4)
+	cmp	$8, %rax
+	je	L(Shl8)
+	jmp	L(Shl12)
+
+L(Align16Both):	/* src and dst both 16-byte aligned: store each block once the next one is loaded */
+	movaps	(%rcx), %xmm1
+	movaps	16(%rcx), %xmm2
+	movaps	%xmm1, (%rdx)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm4
+	movaps	%xmm3, (%rdx, %rsi)
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm1
+	movaps	%xmm4, (%rdx, %rsi)
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm2
+	movaps	%xmm1, (%rdx, %rsi)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%rdx, %rsi)
+	mov	%rcx, %rax
+	lea	16(%rcx, %rsi), %rcx
+	and	$-0x40, %rcx	/* round the source cursor down to a 64-byte boundary */
+	sub	%rcx, %rax
+	sub	%rax, %rdx	/* shift dst by the same distance so both cursors stay in step */
+
+	mov	$-0x40, %rsi	/* the loop advances both cursors by 64 before testing; -64 points back at the block */
+
+L(Aligned64Loop):
+	movaps	(%rcx), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%rcx), %xmm5
+	movaps	32(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%rcx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3	/* xmm3 = byte-wise minimum of all four vectors */
+	pcmpeqd	%xmm0, %xmm3	/* can also flag a zero dword mixed from different vectors; Aligned64Leave re-checks each vector */
+	pmovmskb %xmm3, %rax
+	lea	64(%rdx), %rdx
+	lea	64(%rcx), %rcx
+	test	%rax, %rax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%rdx)
+	movaps	%xmm5, -48(%rdx)
+	movaps	%xmm6, -32(%rdx)
+	movaps	%xmm7, -16(%rdx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):	/* test the four saved vectors one by one, storing each that holds no terminator */
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm5, %xmm0
+
+	pmovmskb %xmm0, %rax
+	movaps	%xmm4, -64(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi	/* lea leaves the flags of the test above intact for the jnz */
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm6, %xmm0
+
+	pmovmskb %xmm0, %rax
+	movaps	%xmm5, -48(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%rdx)
+	pcmpeqd	%xmm7, %xmm0
+
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	$-0x40, %rsi	/* none of the four vectors held a terminator: restore rsi and keep looping */
+	movaps	%xmm7, -16(%rdx)
+	jmp	L(Aligned64Loop)
+
+	.p2align 4
+L(Shl4):	/* src is 4 bytes past a 16-byte boundary: aligned loads are merged with palignr $4 */
+	movaps	-4(%rcx), %xmm1
+	movaps	12(%rcx), %xmm2
+L(Shl4Start):	/* xmm1 = previous aligned block, xmm2 = current aligned block */
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx	/* unlike Shl8/Shl12, xmm3 is not refreshed in this step; xmm1 is reloaded before the loop below */
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	28(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx	/* round the next aligned source block down to a 64-byte boundary */
+	sub	%rcx, %rax
+	lea	-12(%rcx), %rcx	/* keep the bias used by the 12/28/44/60 displacements below */
+	sub	%rax, %rdx
+
+	movaps	-4(%rcx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%rcx), %xmm2
+	movaps	28(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)	/* possible terminator: redo these blocks one at a time (xmm1 and xmm2 are still intact) */
+
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movaps	(%rdx), %xmm6
+	psrldq	$12, %xmm6
+	mov	$12, %rsi	/* CopyFrom1To16Bytes continues 12 bytes further on */
+	palignr	$4, %xmm1, %xmm6	/* low 12 bytes = pending tail of xmm1; top 4 bytes = what was already at (%rdx) */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl8):	/* src is 8 bytes past a 16-byte boundary: same scheme with palignr $8 */
+	movaps	-8(%rcx), %xmm1
+	movaps	8(%rcx), %xmm2
+L(Shl8Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	24(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-8(%rcx), %rcx
+	sub	%rax, %rdx
+
+	movaps	-8(%rcx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%rcx), %xmm2
+	movaps	24(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)
+
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movaps	(%rdx), %xmm6
+	psrldq	$8, %xmm6
+	mov	$8, %rsi
+	palignr	$8, %xmm1, %xmm6	/* low 8 bytes = pending tail of xmm1; top 8 bytes = what was already at (%rdx) */
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl12):	/* src is 12 bytes past a 16-byte boundary: same scheme with palignr $12 */
+	movaps	-12(%rcx), %xmm1
+	movaps	4(%rcx), %xmm2
+L(Shl12Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	20(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-4(%rcx), %rcx
+	sub	%rax, %rdx
+
+	movaps	-12(%rcx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%rcx), %xmm2
+	movaps	20(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movaps	(%rdx), %xmm6
+	psrldq	$4, %xmm6
+	mov	$4, %rsi
+	palignr	$12, %xmm1, %xmm6	/* low 4 bytes = pending tail of xmm1; top 12 bytes = what was already at (%rdx) */
+	movaps	%xmm6, (%rdx)	/* no jmp here: execution falls through to CopyFrom1To16Bytes */
+
+	.p2align 4
+L(CopyFrom1To16Bytes):	/* rax = pmovmskb mask locating the terminator in the 16 bytes at rcx + rsi */
+	add	%rsi, %rdx
+	add	%rsi, %rcx
+
+	test	%al, %al	/* mask bits 0-7 clear: the terminator is the 3rd or 4th wchar */
+	jz	L(ExitHigh)
+	test	$0x01, %al	/* bit 0 set: the terminator is the 1st wchar */
+	jnz	L(Exit4)
+
+	mov	(%rcx), %rax	/* terminator is the 2nd wchar: copy 8 bytes */
+	mov	%rax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah	/* mask bit 8 set: the terminator is the 3rd wchar */
+	jnz	L(Exit12)
+
+	mov	(%rcx), %rax	/* terminator is the 4th wchar: copy 16 bytes */
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %rax
+	mov	%rax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit4):	/* copy 4 bytes (one wchar) and return dst */
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit8):	/* copy 8 bytes (two wchars) and return dst */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit12):	/* copy 12 bytes (three wchars) and return dst */
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %eax
+	mov	%eax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit16):	/* copy 16 bytes (four wchars) and return dst */
+	movdqu	(%rcx), %xmm0
+	movdqu	%xmm0, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+END(__wcscpy_ssse3)
+#endif
+
diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/wcscpy.S
new file mode 100644
index 0000000000..818c5549e6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy.S
@@ -0,0 +1,43 @@
+/* Multiple versions of wcscpy
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc.  */
+#ifndef NOT_IN_libc
+
+	.text
+ENTRY(wcscpy)	/* declared @gnu_indirect_function below: returns the chosen implementation's address in %rax */
+	.type	wcscpy, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)	/* nonzero: skip the __init_cpu_features call */
+	jne	1f
+	call	__init_cpu_features
+
+1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)	/* SSSE3 bit set: pick __wcscpy_ssse3 */
+	jnz	2f
+	leaq	__wcscpy_sse2(%rip), %rax	/* otherwise pick __wcscpy_sse2 */
+	ret
+
+2:	leaq	__wcscpy_ssse3(%rip), %rax
+	ret
+
+END(wcscpy)
+#endif
diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S
new file mode 100644
index 0000000000..b3a1b3b713
--- /dev/null
+++ b/sysdeps/x86_64/wcschr.S
@@ -0,0 +1,155 @@
+/* wcschr with SSE2 (no SSSE3 instructions are used below)
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY (wcschr)	/* in: %rdi = string, low dword of %rsi = wide char; out: %rax = address of first match, or 0 */
+
+	movd	%rsi, %xmm1
+	pxor	%xmm2, %xmm2	/* xmm2 = 0: comparand for the terminator search */
+	mov	%rdi, %rcx
+	punpckldq %xmm1, %xmm1
+	punpckldq %xmm1, %xmm1	/* xmm1 = the wide char replicated into all four dwords */
+
+	and	$63, %rcx
+	cmp	$48, %rcx
+	ja	L(cross_cache)	/* (rdi & 63) > 48: a 16-byte load here would cross a 64-byte boundary */
+
+	movdqu	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2	/* xmm2 = terminator hits */
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0	/* xmm0 = character hits */
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx	/* any character or terminator hit in this block? */
+	jnz	L(matches)
+
+	and	$-16, %rdi	/* continue with aligned loads; this may rescan part of the block above */
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	jmp	L(loop)
+
+L(cross_cache):
+	and	$15, %rcx	/* rcx = offset of the string start inside its aligned 16-byte block */
+	and	$-16, %rdi
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+
+	sar	%cl, %rdx	/* drop mask bits for bytes before the string start */
+	sar	%cl, %rax
+	test	%rax, %rax
+	je	L(unaligned_no_match)
+
+	bsf	%rax, %rax
+	test	%rdx, %rdx
+	je	L(unaligned_match)
+	bsf	%rdx, %rdx
+	cmp	%rdx, %rax
+	ja	L(return_null)	/* the terminator comes before the first match */
+
+L(unaligned_match):
+	add	%rdi, %rax	/* result = aligned base + match offset + start offset */
+	add	%rcx, %rax
+	ret
+
+	.p2align 4
+L(unaligned_no_match):
+	test	%rdx, %rdx
+	jne	L(return_null)
+	pxor	%xmm2, %xmm2	/* clear terminator hits that belonged to bytes before the string start */
+
+	add	$16, %rdi
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop):
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+	jmp	L(loop)
+
+	.p2align 4
+L(matches):
+	pmovmskb %xmm2, %rdx	/* rdx was or-ed with rax above: reload the pure terminator mask */
+	test	%rax, %rax
+	jz	L(return_null)	/* terminator only, no match */
+	bsf	%rax, %rax
+	test	%rdx, %rdx
+	je	L(match)
+	bsf	%rdx, %rcx
+	cmp	%rcx, %rax
+	ja	L(return_null)	/* the terminator comes before the first match */
+L(match):
+	sub	$16, %rdi	/* rdi was already advanced past the block that matched */
+	add	%rdi, %rax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%rax, %rax
+	ret
+
+END (wcschr)
+
+libc_hidden_def(wcschr)
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
new file mode 100644
index 0000000000..c2e4b7e97a
--- /dev/null
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -0,0 +1,283 @@
+/* wcsrchr with SSE2 (no SSSE3 instructions are used below)
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY (wcsrchr)	/* in: %rdi = string, low dword of %rsi = wide char; out: %rax = address of last match, or 0 */
+
+	movd	%rsi, %xmm1
+	mov	%rdi, %rcx
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2	/* xmm2 = 0: comparand for the terminator search */
+	punpckldq %xmm1, %xmm1	/* xmm1 = the wide char replicated into all four dwords */
+	and	$63, %rcx
+	cmp	$48, %rcx
+	ja	L(crosscache)	/* (rdi & 63) > 48: a 16-byte load here would cross a 64-byte boundary */
+
+	movdqu	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rcx	/* rcx = terminator mask */
+	pmovmskb %xmm0, %rax	/* rax = match mask */
+	add	$16, %rdi
+
+	test	%rax, %rax
+	jnz	L(unaligned_match1)
+
+	test	%rcx, %rcx
+	jnz	L(return_null)
+
+	and	$-16, %rdi
+	xor	%r8, %r8	/* r8 = 0: no match remembered yet */
+	jmp	L(loop)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%rcx, %rcx
+	jnz	L(prolog_find_zero_1)
+
+	mov	%rax, %r8	/* r8 = match mask of the latest block containing a match */
+	mov	%rdi, %rsi	/* rsi = address just past that block */
+	and	$-16, %rdi
+	jmp	L(loop)
+
+	.p2align 4
+L(crosscache):
+	and	$15, %rcx	/* rcx = offset of the string start inside its aligned 16-byte block */
+	and	$-16, %rdi
+	pxor	%xmm3, %xmm3
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm3
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm3, %rdx
+	pmovmskb %xmm0, %rax
+	shr	%cl, %rdx	/* drop mask bits for bytes before the string start */
+	shr	%cl, %rax
+	add	$16, %rdi
+
+	test	%rax, %rax
+	jnz	L(unaligned_match)
+
+	test	%rdx, %rdx
+	jnz	L(return_null)
+
+	xor	%r8, %r8
+	jmp	L(loop)
+
+	.p2align 4
+L(unaligned_match):
+	test	%rdx, %rdx
+	jnz	L(prolog_find_zero)
+
+	mov	%rax, %r8
+	lea	(%rdi, %rcx), %rsi	/* the masks were shifted by rcx, so bias the remembered end address the same way */
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rcx	/* any match or terminator in this block? */
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm3
+	pcmpeqd	%xmm3, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm3, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm4
+	pcmpeqd	%xmm4, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm4
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm4, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm5
+	pcmpeqd	%xmm5, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm5
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm5, %rax
+	or	%rax, %rcx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	test	%rax, %rax
+	jnz	L(match)
+L(return_value):	/* no usable match in the final block: fall back to the one remembered in r8/rsi */
+	test	%r8, %r8
+	jz	L(return_null)
+	mov	%r8, %rax
+	mov	%rsi, %rdi
+
+	test	$15 << 4, %ah	/* mask bits 12-15: match in the 4th wchar */
+	jnz	L(match_fourth_wchar)
+	test	%ah, %ah	/* mask bits 8-11: match in the 3rd wchar */
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al	/* mask bits 4-7: match in the 2nd wchar */
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax	/* otherwise the match is the 1st wchar of the block */
+	ret
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %rcx	/* rcx was or-ed with rax above: reload the pure terminator mask */
+	test	%rcx, %rcx
+	jnz	L(find_zero)
+	mov	%rax, %r8	/* no terminator yet: remember this block and keep scanning */
+	mov	%rdi, %rsi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):	/* block holds both a match and the terminator: keep only matches up to the terminator */
+	test	$15, %cl
+	jnz	L(find_zero_in_first_wchar)
+	test	%cl, %cl
+	jnz	L(find_zero_in_second_wchar)
+	test	$15, %ch
+	jnz	L(find_zero_in_third_wchar)
+
+	and	$1 << 13 - 1, %rax	/* GAS: << binds tighter than -, so this is (1 << 13) - 1 = 0x1fff */
+	jz	L(return_value)
+
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(find_zero_in_first_wchar):
+	test	$1, %rax	/* a match on the terminator itself (searching for L'\0') */
+	jz	L(return_value)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %rax	/* (1 << 5) - 1 = 0x1f */
+	jz	L(return_value)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(find_zero_in_third_wchar):
+	and	$1 << 9 - 1, %rax	/* (1 << 9) - 1 = 0x1ff */
+	jz	L(return_value)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero):	/* same as find_zero for the first block; nothing is remembered yet, so failure returns 0 */
+	add	%rcx, %rdi
+	mov	%rdx, %rcx
+L(prolog_find_zero_1):
+	test	$15, %cl
+	jnz	L(prolog_find_zero_in_first_wchar)
+	test	%cl, %cl
+	jnz	L(prolog_find_zero_in_second_wchar)
+	test	$15, %ch
+	jnz	L(prolog_find_zero_in_third_wchar)
+
+	and	$1 << 13 - 1, %rax
+	jz	L(return_null)
+
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_in_first_wchar):
+	test	$1, %rax
+	jz	L(return_null)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %rax
+	jz	L(return_null)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_in_third_wchar):
+	and	$1 << 9 - 1, %rax
+	jz	L(return_null)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match_second_wchar):
+	lea	-12(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match_third_wchar):
+	lea	-8(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match_fourth_wchar):
+	lea	-4(%rdi), %rax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%rax, %rax
+	ret
+
+END (wcsrchr) |