From 4f26ef1b67287d1f2c32865f7d79c13abda81915 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Tue, 30 May 2017 12:39:14 -0700 Subject: x86_64: Remove redundant REX bytes from memchr.S By x86-64 specification, 32-bit destination registers are zero-extended to 64 bits. There is no need to use 64-bit registers when only the lower 32 bits are non-zero. * sysdeps/x86_64/memchr.S (MEMCHR): Use 32-bit registers for the lower 32 bits. --- ChangeLog | 5 +++++ sysdeps/x86_64/memchr.S | 28 ++++++++++++++-------------- 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/ChangeLog b/ChangeLog index 93f0088ace..92d762e48c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +2017-05-30 H.J. Lu + + * sysdeps/x86_64/memchr.S (memchr): Use 32-bit registers for + the lower 32 bits. + 2017-05-29 Andreas Schwab * sysdeps/m68k/Makefile (ASFLAGS-.o) [$(subdir) = csu && diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S index f82e1c5bf7..d3be012424 100644 --- a/sysdeps/x86_64/memchr.S +++ b/sysdeps/x86_64/memchr.S @@ -22,18 +22,18 @@ .text ENTRY(memchr) - movd %rsi, %xmm1 - mov %rdi, %rcx + movd %esi, %xmm1 + mov %edi, %ecx punpcklbw %xmm1, %xmm1 test %rdx, %rdx jz L(return_null) punpcklbw %xmm1, %xmm1 - and $63, %rcx + and $63, %ecx pshufd $0, %xmm1, %xmm1 - cmp $48, %rcx + cmp $48, %ecx ja L(crosscache) movdqu (%rdi), %xmm0 @@ -45,7 +45,7 @@ ENTRY(memchr) sub $16, %rdx jbe L(return_null) add $16, %rdi - and $15, %rcx + and $15, %ecx and $-16, %rdi add %rcx, %rdx sub $64, %rdx @@ -54,7 +54,7 @@ ENTRY(memchr) .p2align 4 L(crosscache): - and $15, %rcx + and $15, %ecx and $-16, %rdi movdqa (%rdi), %xmm0 @@ -148,7 +148,7 @@ L(loop_prolog): mov %rdi, %rcx and $-64, %rdi - and $63, %rcx + and $63, %ecx add %rcx, %rdx .p2align 4 @@ -200,7 +200,7 @@ L(align64_loop): .p2align 4 L(exit_loop): - add $32, %rdx + add $32, %edx jle L(exit_loop_32) movdqa (%rdi), %xmm0 @@ -220,32 +220,32 @@ L(exit_loop): pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches32_1) - sub $16, %rdx + sub $16, %edx jle L(return_null) pcmpeqb 48(%rdi), %xmm1 pmovmskb %xmm1, %eax test %eax, %eax jnz L(matches48_1) - xor %rax, %rax + xor %eax, %eax ret .p2align 4 L(exit_loop_32): - add $32, %rdx + add $32, %edx movdqa (%rdi), %xmm0 pcmpeqb %xmm1, %xmm0 pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches_1) - sub $16, %rdx + sub $16, %edx jbe L(return_null) pcmpeqb 16(%rdi), %xmm1 pmovmskb %xmm1, %eax test %eax, %eax jnz L(matches16_1) - xor %rax, %rax + xor %eax, %eax ret .p2align 4 @@ -306,7 +306,7 @@ L(matches48_1): .p2align 4 L(return_null): - xor %rax, %rax + xor %eax, %eax ret END(memchr) -- cgit 1.4.1