about summary refs log tree commit diff
diff options
context:
space:
mode:
author H.J. Lu <hongjiu.lu@intel.com> 2009-07-25 19:15:14 -0700
committer Ulrich Drepper <drepper@redhat.com> 2009-07-25 19:15:14 -0700
commit 4e5b5821bf58ddc30d455ee4968623f3334fbe28 (patch)
tree 1e3ceb13e48e5210c126639e37ff90a1e7a104cc
parent 657317537c09b82a2feb1194fda045f63e3a1222 (diff)
download glibc-4e5b5821bf58ddc30d455ee4968623f3334fbe28.tar.gz
glibc-4e5b5821bf58ddc30d455ee4968623f3334fbe28.tar.xz
glibc-4e5b5821bf58ddc30d455ee4968623f3334fbe28.zip
Some more optimizations for x86-64 strcmp.
-rw-r--r-- ChangeLog 3
-rw-r--r-- sysdeps/x86_64/multiarch/strcmp.S 13
2 files changed, 7 insertions, 9 deletions
diff --git a/ChangeLog b/ChangeLog
index f47b0897a4..abfe7dbfbc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
 2009-07-25  Ulrich Drepper  <drepper@redhat.com>
 
+	* sysdeps/x86_64/multiarch/strcmp.S: Some more optimizations for
+	modern processor versions.  Patch by H.J. Lu <hongjiu.lu@intel.com>.
+
 	[BZ #10448]
 	* sysdeps/posix/getaddrinfo.c (gaih_inet): If NSS module contains no
 	callback we must touch the status to avoid using stale value.
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index f9cf943e32..15148e4f7f 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -120,10 +120,8 @@ STRCMP_SSE42:
 	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
 	cmp	$0x30, %eax
 	ja	LABEL(crosscache)	/* rdi: 16-byte load will cross cache line */
-	movlpd	(%rdi), %xmm1
-	movlpd	(%rsi), %xmm2
-	movhpd	8(%rdi), %xmm1
-	movhpd	8(%rsi), %xmm2
+	movdqu	(%rdi), %xmm1
+	movdqu	(%rsi), %xmm2
 	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
 	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
@@ -1492,11 +1490,8 @@ LABEL(less16bytes):
 	sub	%rdx, %r11
 	jbe	LABEL(strcmp_exitz)
 #endif
-	xor	%ecx, %ecx		/* clear %ecx */
-	xor	%eax, %eax		/* clear %eax */
-
-	movb	(%rsi, %rdx), %cl
-	movb	(%rdi, %rdx), %al
+	movzbl	(%rsi, %rdx), %ecx
+	movzbl	(%rdi, %rdx), %eax
 
 	sub	%ecx, %eax
 	ret