about summary refs log tree commit diff
diff options
context:
space:
mode:
author: H.J. Lu <hjl.tools@gmail.com> 2017-06-05 07:41:14 -0700
committer: H.J. Lu <hjl.tools@gmail.com> 2017-06-05 07:41:26 -0700
commit: 7395928b957ebb35afb696c3278d14122aa97b51 (patch)
tree: ca45794e0ad69d0e4d03bd94d36d6afcb54edd6b
parent: d8a7d10324d9765fa62f42c1d94c5bf36b60d558 (diff)
download: glibc-7395928b957ebb35afb696c3278d14122aa97b51.tar.gz
          glibc-7395928b957ebb35afb696c3278d14122aa97b51.tar.xz
          glibc-7395928b957ebb35afb696c3278d14122aa97b51.zip
x86_64: Remove redundant REX bytes from memrchr.S
Per the x86-64 specification, a write to a 32-bit destination register
is zero-extended to 64 bits.  There is no need to use 64-bit registers
when only the lower 32 bits are non-zero.  Also, 2 instructions in:

	mov	%rdi, %rcx
	and	$15, %rcx
	jz	L(length_less16_offset0)

	mov	%rdi, %rcx		<<< redundant
	and	$15, %rcx		<<< redundant

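are redundant: nothing between the first pair and the second modifies
%rcx, so it already holds %rdi & 15 when the jz is not taken.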

	* sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for
	the lower 32 bits.  Remove redundant instructions.
-rw-r--r--  ChangeLog                  5
-rw-r--r--  sysdeps/x86_64/memrchr.S  36
2 files changed, 22 insertions, 19 deletions
diff --git a/ChangeLog b/ChangeLog
index 1cbcf564e0..1549eb6422 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
 2017-06-05  H.J. Lu  <hongjiu.lu@intel.com>
 
+	* sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for
+	the lower 32 bits.  Remove redundant instructions.
+
+2017-06-05  H.J. Lu  <hongjiu.lu@intel.com>
+
 	* sysdeps/unix/sysv/linux/x86_64/sysdep.h (LO_HI_LONG): Pass
 	0 as the high part of offset.
 	* sysdeps/unix/sysv/linux/x86_64/x32/sysdep.h (LO_HI_LONG): New.
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index aab1a4a0c4..5fa0fe9c1c 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -22,7 +22,7 @@
 
 	.text
 ENTRY (__memrchr)
-	movd	%rsi, %xmm1
+	movd	%esi, %xmm1
 
 	sub	$16, %rdx
 	jbe	L(length_less16)
@@ -42,8 +42,8 @@ ENTRY (__memrchr)
 	jnz	L(matches0)
 
 	sub	$64, %rdi
-	mov	%rdi, %rcx
-	and	$15, %rcx
+	mov	%edi, %ecx
+	and	$15, %ecx
 	jz	L(loop_prolog)
 
 	add	$16, %rdi
@@ -108,8 +108,8 @@ L(loop_prolog):
 	test	%eax, %eax
 	jnz	L(matches0)
 
-	mov	%rdi, %rcx
-	and	$63, %rcx
+	mov	%edi, %ecx
+	and	$63, %ecx
 	jz	L(align64_loop)
 
 	add	$64, %rdi
@@ -166,8 +166,8 @@ L(align64_loop):
 
 	.p2align 4
 L(exit_loop):
-	add	$64, %rdx
-	cmp	$32, %rdx
+	add	$64, %edx
+	cmp	$32, %edx
 	jbe	L(exit_loop_32)
 
 	movdqa	48(%rdi), %xmm0
@@ -187,7 +187,7 @@ L(exit_loop):
 	pmovmskb	%xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches16_1)
-	cmp	$48, %rdx
+	cmp	$48, %edx
 	jbe	L(return_null)
 
 	pcmpeqb	(%rdi), %xmm1
@@ -204,7 +204,7 @@ L(exit_loop_32):
 	pmovmskb	%xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches48_1)
-	cmp	$16, %rdx
+	cmp	$16, %edx
 	jbe	L(return_null)
 
 	pcmpeqb	32(%rdi), %xmm1
@@ -276,7 +276,7 @@ L(matches48_1):
 
 	.p2align 4
 L(return_null):
-	xor	%rax, %rax
+	xor	%eax, %eax
 	ret
 
 	.p2align 4
@@ -306,18 +306,16 @@ L(length_less16):
 	punpcklbw	%xmm1, %xmm1
 	punpcklbw	%xmm1, %xmm1
 
-	add	$16, %rdx
+	add	$16, %edx
 
 	pshufd	$0, %xmm1, %xmm1
 
-	mov	%rdi, %rcx
-	and	$15, %rcx
+	mov	%edi, %ecx
+	and	$15, %ecx
 	jz	L(length_less16_offset0)
 
-	mov	%rdi, %rcx
-	and	$15, %rcx
 	mov	%cl, %dh
-	mov	%rcx, %r8
+	mov	%ecx, %esi
 	add	%dl, %dh
 	and	$-16, %rdi
 
@@ -340,7 +338,7 @@ L(length_less16):
 
 	bsr	%eax, %eax
 	add	%rdi, %rax
-	add	%r8, %rax
+	add	%rsi, %rax
 	ret
 
 	.p2align 4
@@ -362,14 +360,14 @@ L(length_less16_part2):
 	pcmpeqb	(%rdi), %xmm1
 	pmovmskb	%xmm1, %eax
 
-	mov	%r8, %rcx
+	mov	%esi, %ecx
 	sar	%cl, %eax
 	test	%eax, %eax
 	jz	L(return_null)
 
 	bsr	%eax, %eax
 	add	%rdi, %rax
-	add	%r8, %rax
+	add	%rsi, %rax
 	ret
 
 	.p2align 4