diff options
author | H.J. Lu <hjl.tools@gmail.com> | 2017-06-05 07:41:14 -0700 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2017-06-05 07:41:26 -0700 |
commit | 7395928b957ebb35afb696c3278d14122aa97b51 (patch) | |
tree | ca45794e0ad69d0e4d03bd94d36d6afcb54edd6b | |
parent | d8a7d10324d9765fa62f42c1d94c5bf36b60d558 (diff) | |
download | glibc-7395928b957ebb35afb696c3278d14122aa97b51.tar.gz glibc-7395928b957ebb35afb696c3278d14122aa97b51.tar.xz glibc-7395928b957ebb35afb696c3278d14122aa97b51.zip |
x86_64: Remove redundant REX bytes from memrchr.S
By x86-64 specification, 32-bit destination registers are zero-extended to 64 bits. There is no need to use 64-bit registers when only the lower 32 bits are non-zero. Also 2 instructions in:

    mov %rdi, %rcx
    and $15, %rcx
    jz L(length_less16_offset0)

    mov %rdi, %rcx    <<< redundant
    and $15, %rcx     <<< redundant

are redundant.

* sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for the lower 32 bits. Remove redundant instructions.
-rw-r--r-- | ChangeLog | 5 | ||||
-rw-r--r-- | sysdeps/x86_64/memrchr.S | 36 |
2 files changed, 22 insertions, 19 deletions
diff --git a/ChangeLog b/ChangeLog index 1cbcf564e0..1549eb6422 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,10 @@ 2017-06-05 H.J. Lu <hongjiu.lu@intel.com> + * sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for + the lower 32 bits. Remove redundant instructions. + +2017-06-05 H.J. Lu <hongjiu.lu@intel.com> + * sysdeps/unix/sysv/linux/x86_64/sysdep.h (LO_HI_LONG): Pass 0 as the high part of offset. * sysdeps/unix/sysv/linux/x86_64/x32/sysdep.h (LO_HI_LONG): New. diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S index aab1a4a0c4..5fa0fe9c1c 100644 --- a/sysdeps/x86_64/memrchr.S +++ b/sysdeps/x86_64/memrchr.S @@ -22,7 +22,7 @@ .text ENTRY (__memrchr) - movd %rsi, %xmm1 + movd %esi, %xmm1 sub $16, %rdx jbe L(length_less16) @@ -42,8 +42,8 @@ ENTRY (__memrchr) jnz L(matches0) sub $64, %rdi - mov %rdi, %rcx - and $15, %rcx + mov %edi, %ecx + and $15, %ecx jz L(loop_prolog) add $16, %rdi @@ -108,8 +108,8 @@ L(loop_prolog): test %eax, %eax jnz L(matches0) - mov %rdi, %rcx - and $63, %rcx + mov %edi, %ecx + and $63, %ecx jz L(align64_loop) add $64, %rdi @@ -166,8 +166,8 @@ L(align64_loop): .p2align 4 L(exit_loop): - add $64, %rdx - cmp $32, %rdx + add $64, %edx + cmp $32, %edx jbe L(exit_loop_32) movdqa 48(%rdi), %xmm0 @@ -187,7 +187,7 @@ L(exit_loop): pmovmskb %xmm3, %eax test %eax, %eax jnz L(matches16_1) - cmp $48, %rdx + cmp $48, %edx jbe L(return_null) pcmpeqb (%rdi), %xmm1 @@ -204,7 +204,7 @@ L(exit_loop_32): pmovmskb %xmm0, %eax test %eax, %eax jnz L(matches48_1) - cmp $16, %rdx + cmp $16, %edx jbe L(return_null) pcmpeqb 32(%rdi), %xmm1 @@ -276,7 +276,7 @@ L(matches48_1): .p2align 4 L(return_null): - xor %rax, %rax + xor %eax, %eax ret .p2align 4 @@ -306,18 +306,16 @@ L(length_less16): punpcklbw %xmm1, %xmm1 punpcklbw %xmm1, %xmm1 - add $16, %rdx + add $16, %edx pshufd $0, %xmm1, %xmm1 - mov %rdi, %rcx - and $15, %rcx + mov %edi, %ecx + and $15, %ecx jz L(length_less16_offset0) - mov %rdi, %rcx - and $15, %rcx mov %cl, %dh - mov %rcx, 
%r8 + mov %ecx, %esi add %dl, %dh and $-16, %rdi @@ -340,7 +338,7 @@ L(length_less16): bsr %eax, %eax add %rdi, %rax - add %r8, %rax + add %rsi, %rax ret .p2align 4 @@ -362,14 +360,14 @@ L(length_less16_part2): pcmpeqb (%rdi), %xmm1 pmovmskb %xmm1, %eax - mov %r8, %rcx + mov %esi, %ecx sar %cl, %eax test %eax, %eax jz L(return_null) bsr %eax, %eax add %rdi, %rax - add %r8, %rax + add %rsi, %rax ret .p2align 4 |