Optimized wcschr and wcscpy for x86-64 and x86-32

author: Ulrich Drepper <drepper@gmail.com> 2011-12-17 14:39:23 -0500
committer: Ulrich Drepper <drepper@gmail.com> 2011-12-17 14:39:23 -0500
commit: 1d3e4b618ae0217f1736753f3085f9c4fcc827bf (patch)
tree: 90a3f8d19f941a684e1482b8813c534d82cfb19e /sysdeps/x86_64
parent: a2d18b64edb486825fb5946eefc2131426ccfec9 (diff)
download: glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.gz
glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.xz
glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.zip
6 files changed, 1057 insertions, 1 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4cf4cf4b28..9a183f068e 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
 		   strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
-		   memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c
+		   memcmp-ssse3
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
@@ -28,3 +28,7 @@ CFLAGS-strcasestr.c += -msse4
 CFLAGS-strcasestr-nonascii.c += -msse4
 endif
 endif
+
+ifeq ($(subdir),wcsmbs)
+sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
+endif
diff --git a/sysdeps/x86_64/multiarch/wcscpy-c.c b/sysdeps/x86_64/multiarch/wcscpy-c.c
new file mode 100644
index 0000000000..f27c069198
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-c.c
@@ -0,0 +1,5 @@
+#ifndef NOT_IN_libc
+# define wcscpy  __wcscpy_sse2
+#endif
+
+#include "wcsmbs/wcscpy.c"
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
new file mode 100644
index 0000000000..133700fbe4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -0,0 +1,566 @@
+/* wcscpy with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef NOT_IN_libc
+# include <sysdep.h>
+
+.text
+ENTRY (__wcscpy_ssse3)
+	mov	%rsi, %rcx
+	mov	%rdi, %rdx
+
+	cmpl	$0, (%rcx)
+	jz	L(Exit4)
+	cmpl	$0, 4(%rcx)
+	jz	L(Exit8)
+	cmpl	$0, 8(%rcx)
+	jz	L(Exit12)
+	cmpl	$0, 12(%rcx)
+	jz	L(Exit16)
+
+	lea	16(%rcx), %rsi
+	and	$-16, %rsi
+
+	pxor	%xmm0, %xmm0
+	mov	(%rcx), %r9
+	mov	%r9, (%rdx)
+
+	pcmpeqd	(%rsi), %xmm0
+	mov	8(%rcx), %r9
+	mov	%r9, 8(%rdx)
+
+	pmovmskb %xmm0, %rax
+	sub	%rcx, %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	%rdx, %rax
+	lea	16(%rdx), %rdx
+	and	$-16, %rdx
+	sub	%rdx, %rax
+	sub	%rax, %rcx
+	mov	%rcx, %rax
+	and	$0xf, %rax
+	mov	$0, %rsi
+
+/* case: rcx_offset == rdx_offset */
+
+	jz	L(Align16Both)
+
+	cmp	$4, %rax
+	je	L(Shl4)
+	cmp	$8, %rax
+	je	L(Shl8)
+	jmp	L(Shl12)
+
+L(Align16Both):
+	movaps	(%rcx), %xmm1
+	movaps	16(%rcx), %xmm2
+	movaps	%xmm1, (%rdx)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm4
+	movaps	%xmm3, (%rdx, %rsi)
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm1
+	movaps	%xmm4, (%rdx, %rsi)
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm2
+	movaps	%xmm1, (%rdx, %rsi)
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	16(%rcx, %rsi), %xmm3
+	movaps	%xmm2, (%rdx, %rsi)
+	pcmpeqd	%xmm3, %xmm0
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm3, (%rdx, %rsi)
+	mov	%rcx, %rax
+	lea	16(%rcx, %rsi), %rcx
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	sub	%rax, %rdx
+
+	mov	$-0x40, %rsi
+
+L(Aligned64Loop):
+	movaps	(%rcx), %xmm2
+	movaps	%xmm2, %xmm4
+	movaps	16(%rcx), %xmm5
+	movaps	32(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	48(%rcx), %xmm7
+	pminub	%xmm5, %xmm2
+	pminub	%xmm7, %xmm3
+	pminub	%xmm2, %xmm3
+	pcmpeqd	%xmm0, %xmm3
+	pmovmskb %xmm3, %rax
+	lea	64(%rdx), %rdx
+	lea	64(%rcx), %rcx
+	test	%rax, %rax
+	jnz	L(Aligned64Leave)
+	movaps	%xmm4, -64(%rdx)
+	movaps	%xmm5, -48(%rdx)
+	movaps	%xmm6, -32(%rdx)
+	movaps	%xmm7, -16(%rdx)
+	jmp	L(Aligned64Loop)
+
+L(Aligned64Leave):
+	pcmpeqd	%xmm4, %xmm0
+	pmovmskb %xmm0, %rax
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm5, %xmm0
+
+	pmovmskb %xmm0, %rax
+	movaps	%xmm4, -64(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi
+	jnz	L(CopyFrom1To16Bytes)
+
+	pcmpeqd	%xmm6, %xmm0
+
+	pmovmskb %xmm0, %rax
+	movaps	%xmm5, -48(%rdx)
+	test	%rax, %rax
+	lea	16(%rsi), %rsi
+	jnz	L(CopyFrom1To16Bytes)
+
+	movaps	%xmm6, -32(%rdx)
+	pcmpeqd	%xmm7, %xmm0
+
+	pmovmskb %xmm0, %rax
+	lea	16(%rsi), %rsi
+	test	%rax, %rax
+	jnz	L(CopyFrom1To16Bytes)
+
+	mov	$-0x40, %rsi
+	movaps	%xmm7, -16(%rdx)
+	jmp	L(Aligned64Loop)
+
+	.p2align 4
+L(Shl4):
+	movaps	-4(%rcx), %xmm1
+	movaps	12(%rcx), %xmm2
+L(Shl4Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	28(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+
+	test	%rax, %rax
+	jnz	L(Shl4LoopExit)
+
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	28(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-12(%rcx), %rcx
+	sub	%rax, %rdx
+
+	movaps	-4(%rcx), %xmm1
+
+L(Shl4LoopStart):
+	movaps	12(%rcx), %xmm2
+	movaps	28(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	44(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	60(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$4, %xmm3, %xmm4
+	jnz	L(Shl4Start)
+
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl4LoopStart)
+
+L(Shl4LoopExit):
+	movaps	(%rdx), %xmm6
+	psrldq	$12, %xmm6
+	mov	$12, %rsi
+	palignr	$4, %xmm1, %xmm6
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl8):
+	movaps	-8(%rcx), %xmm1
+	movaps	8(%rcx), %xmm2
+L(Shl8Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	24(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl8LoopExit)
+
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	24(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-8(%rcx), %rcx
+	sub	%rax, %rdx
+
+	movaps	-8(%rcx), %xmm1
+
+L(Shl8LoopStart):
+	movaps	8(%rcx), %xmm2
+	movaps	24(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	40(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	56(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$8, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$8, %xmm3, %xmm4
+	jnz	L(Shl8Start)
+
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl8LoopStart)
+
+L(Shl8LoopExit):
+	movaps	(%rdx), %xmm6
+	psrldq	$8, %xmm6
+	mov	$8, %rsi
+	palignr	$8, %xmm1, %xmm6
+	movaps	%xmm6, (%rdx)
+	jmp	L(CopyFrom1To16Bytes)
+
+	.p2align 4
+L(Shl12):
+	movaps	-12(%rcx), %xmm1
+	movaps	4(%rcx), %xmm2
+L(Shl12Start):
+	pcmpeqd	%xmm2, %xmm0
+	pmovmskb %xmm0, %rax
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+	movaps	%xmm3, %xmm1
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	movaps	20(%rcx), %xmm2
+
+	pcmpeqd	%xmm2, %xmm0
+	lea	16(%rdx), %rdx
+	pmovmskb %xmm0, %rax
+	lea	16(%rcx), %rcx
+	movaps	%xmm2, %xmm3
+
+	test	%rax, %rax
+	jnz	L(Shl12LoopExit)
+
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm3, %xmm1
+	movaps	%xmm2, (%rdx)
+	lea	20(%rcx), %rcx
+	lea	16(%rdx), %rdx
+
+	mov	%rcx, %rax
+	and	$-0x40, %rcx
+	sub	%rcx, %rax
+	lea	-4(%rcx), %rcx
+	sub	%rax, %rdx
+
+	movaps	-12(%rcx), %xmm1
+
+L(Shl12LoopStart):
+	movaps	4(%rcx), %xmm2
+	movaps	20(%rcx), %xmm3
+	movaps	%xmm3, %xmm6
+	movaps	36(%rcx), %xmm4
+	movaps	%xmm4, %xmm7
+	movaps	52(%rcx), %xmm5
+	pminub	%xmm2, %xmm6
+	pminub	%xmm5, %xmm7
+	pminub	%xmm6, %xmm7
+	pcmpeqd	%xmm0, %xmm7
+	pmovmskb %xmm7, %rax
+	movaps	%xmm5, %xmm7
+	palignr	$12, %xmm4, %xmm5
+	test	%rax, %rax
+	palignr	$12, %xmm3, %xmm4
+	jnz	L(Shl12Start)
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%rcx), %rcx
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm7, %xmm1
+	movaps	%xmm5, 48(%rdx)
+	movaps	%xmm4, 32(%rdx)
+	movaps	%xmm3, 16(%rdx)
+	movaps	%xmm2, (%rdx)
+	lea	64(%rdx), %rdx
+	jmp	L(Shl12LoopStart)
+
+L(Shl12LoopExit):
+	movaps	(%rdx), %xmm6
+	psrldq	$4, %xmm6
+	mov	$4, %rsi
+	palignr	$12, %xmm1, %xmm6
+	movaps	%xmm6, (%rdx)
+
+	.p2align 4
+L(CopyFrom1To16Bytes):
+	add	%rsi, %rdx
+	add	%rsi, %rcx
+
+	test	%al, %al
+	jz	L(ExitHigh)
+	test	$0x01, %al
+	jnz	L(Exit4)
+
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(ExitHigh):
+	test	$0x01, %ah
+	jnz	L(Exit12)
+
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %rax
+	mov	%rax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit4):
+	movl	(%rcx), %eax
+	movl	%eax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit8):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit12):
+	mov	(%rcx), %rax
+	mov	%rax, (%rdx)
+	mov	8(%rcx), %eax
+	mov	%eax, 8(%rdx)
+	mov	%rdi, %rax
+	ret
+
+	.p2align 4
+L(Exit16):
+	movdqu	(%rcx), %xmm0
+	movdqu	%xmm0, (%rdx)
+	mov	%rdi, %rax
+	ret
+
+END(__wcscpy_ssse3)
+#endif
+
diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/wcscpy.S
new file mode 100644
index 0000000000..818c5549e6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscpy.S
@@ -0,0 +1,43 @@
+/* Multiple versions of wcscpy
+   Copyright (C)  2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#ifndef NOT_IN_libc
+
+	.text
+ENTRY(wcscpy)
+	.type	wcscpy, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
+	jne	1f
+	call	__init_cpu_features
+
+1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jnz	2f
+	leaq	__wcscpy_sse2(%rip), %rax
+	ret
+
+2:	leaq	__wcscpy_ssse3(%rip), %rax
+	ret
+
+END(wcscpy)
+#endif
diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S
new file mode 100644
index 0000000000..b3a1b3b713
--- /dev/null
+++ b/sysdeps/x86_64/wcschr.S
@@ -0,0 +1,155 @@
+/* wcschr with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY (wcschr)
+
+	movd	%rsi, %xmm1
+	pxor	%xmm2, %xmm2
+	mov	%rdi, %rcx
+	punpckldq %xmm1, %xmm1
+	punpckldq %xmm1, %xmm1
+
+	and	$63, %rcx
+	cmp	$48, %rcx
+	ja	L(cross_cache)
+
+	movdqu	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	and	$-16, %rdi
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	jmp	L(loop)
+
+L(cross_cache):
+	and	$15, %rcx
+	and	$-16, %rdi
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+
+	sar	%cl, %rdx
+	sar	%cl, %rax
+	test	%rax, %rax
+	je	L(unaligned_no_match)
+
+	bsf	%rax, %rax
+	test	%rdx, %rdx
+	je	L(unaligned_match)
+	bsf	%rdx, %rdx
+	cmp	%rdx, %rax
+	ja	L(return_null)
+
+L(unaligned_match):
+	add	%rdi, %rax
+	add	%rcx, %rax
+	ret
+
+	.p2align 4
+L(unaligned_no_match):
+	test	%rdx, %rdx
+	jne	L(return_null)
+	pxor	%xmm2, %xmm2
+
+	add	$16, %rdi
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop):
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+	jmp	L(loop)
+
+	.p2align 4
+L(matches):
+	pmovmskb %xmm2, %rdx
+	test	%rax, %rax
+	jz	L(return_null)
+	bsf	%rax, %rax
+	test	%rdx, %rdx
+	je	L(match)
+	bsf	%rdx, %rcx
+	cmp	%rcx, %rax
+	ja	L(return_null)
+L(match):
+	sub	$16, %rdi
+	add	%rdi, %rax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%rax, %rax
+	ret
+
+END (wcschr)
+
+libc_hidden_def(wcschr)
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
new file mode 100644
index 0000000000..c2e4b7e97a
--- /dev/null
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -0,0 +1,283 @@
+/* wcsrchr with SSSE3
+   Copyright (C) 2011 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+	.text
+ENTRY (wcsrchr)
+
+	movd	%rsi, %xmm1
+	mov	%rdi, %rcx
+	punpckldq %xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	punpckldq %xmm1, %xmm1
+	and	$63, %rcx
+	cmp	$48, %rcx
+	ja	L(crosscache)
+
+	movdqu	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm0, %rax
+	add	$16, %rdi
+
+	test	%rax, %rax
+	jnz	L(unaligned_match1)
+
+	test	%rcx, %rcx
+	jnz	L(return_null)
+
+	and	$-16, %rdi
+	xor	%r8, %r8
+	jmp	L(loop)
+
+	.p2align 4
+L(unaligned_match1):
+	test	%rcx, %rcx
+	jnz	L(prolog_find_zero_1)
+
+	mov	%rax, %r8
+	mov	%rdi, %rsi
+	and	$-16, %rdi
+	jmp	L(loop)
+
+	.p2align 4
+L(crosscache):
+	and	$15, %rcx
+	and	$-16, %rdi
+	pxor	%xmm3, %xmm3
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm3
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm3, %rdx
+	pmovmskb %xmm0, %rax
+	shr	%cl, %rdx
+	shr	%cl, %rax
+	add	$16, %rdi
+
+	test	%rax, %rax
+	jnz	L(unaligned_match)
+
+	test	%rdx, %rdx
+	jnz	L(return_null)
+
+	xor	%r8, %r8
+	jmp	L(loop)
+
+	.p2align 4
+L(unaligned_match):
+	test	%rdx, %rdx
+	jnz	L(prolog_find_zero)
+
+	mov	%rax, %r8
+	lea	(%rdi, %rcx), %rsi
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm3
+	pcmpeqd	%xmm3, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm3, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm4
+	pcmpeqd	%xmm4, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm4
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm4, %rax
+	or	%rax, %rcx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm5
+	pcmpeqd	%xmm5, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm5
+	pmovmskb %xmm2, %rcx
+	pmovmskb %xmm5, %rax
+	or	%rax, %rcx
+	jz	L(loop)
+
+	.p2align 4
+L(matches):
+	test	%rax, %rax
+	jnz	L(match)
+L(return_value):
+	test	%r8, %r8
+	jz	L(return_null)
+	mov	%r8, %rax
+	mov	%rsi, %rdi
+
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match):
+	pmovmskb %xmm2, %rcx
+	test	%rcx, %rcx
+	jnz	L(find_zero)
+	mov	%rax, %r8
+	mov	%rdi, %rsi
+	jmp	L(loop)
+
+	.p2align 4
+L(find_zero):
+	test	$15, %cl
+	jnz	L(find_zero_in_first_wchar)
+	test	%cl, %cl
+	jnz	L(find_zero_in_second_wchar)
+	test	$15, %ch
+	jnz	L(find_zero_in_third_wchar)
+
+	and	$1 << 13 - 1, %rax
+	jz	L(return_value)
+
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(find_zero_in_first_wchar):
+	test	$1, %rax
+	jz	L(return_value)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %rax
+	jz	L(return_value)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(find_zero_in_third_wchar):
+	and	$1 << 9 - 1, %rax
+	jz	L(return_value)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero):
+	add	%rcx, %rdi
+	mov     %rdx, %rcx
+L(prolog_find_zero_1):
+	test	$15, %cl
+	jnz	L(prolog_find_zero_in_first_wchar)
+	test	%cl, %cl
+	jnz	L(prolog_find_zero_in_second_wchar)
+	test	$15, %ch
+	jnz	L(prolog_find_zero_in_third_wchar)
+
+	and	$1 << 13 - 1, %rax
+	jz	L(return_null)
+
+	test	$15 << 4, %ah
+	jnz	L(match_fourth_wchar)
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_in_first_wchar):
+	test	$1, %rax
+	jz	L(return_null)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_in_second_wchar):
+	and	$1 << 5 - 1, %rax
+	jz	L(return_null)
+
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(prolog_find_zero_in_third_wchar):
+	and	$1 << 9 - 1, %rax
+	jz	L(return_null)
+
+	test	%ah, %ah
+	jnz	L(match_third_wchar)
+	test	$15 << 4, %al
+	jnz	L(match_second_wchar)
+	lea	-16(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match_second_wchar):
+	lea	-12(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match_third_wchar):
+	lea	-8(%rdi), %rax
+	ret
+
+	.p2align 4
+L(match_fourth_wchar):
+	lea	-4(%rdi), %rax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%rax, %rax
+	ret
+
+END (wcsrchr)
author	Ulrich Drepper <drepper@gmail.com>	2011-12-17 14:39:23 -0500
committer	Ulrich Drepper <drepper@gmail.com>	2011-12-17 14:39:23 -0500
commit	1d3e4b618ae0217f1736753f3085f9c4fcc827bf (patch)
tree	90a3f8d19f941a684e1482b8813c534d82cfb19e /sysdeps/x86_64
parent	a2d18b64edb486825fb5946eefc2131426ccfec9 (diff)
download	glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.gz glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.tar.xz glibc-1d3e4b618ae0217f1736753f3085f9c4fcc827bf.zip