author | H.J. Lu <hjl.tools@gmail.com> | 2015-08-25 08:48:21 -0700 |
---|---|---|
committer | H.J. Lu <hjl.tools@gmail.com> | 2015-08-25 08:48:34 -0700 |
commit | 5f92ec52e795dc004f8e8d17317e4572695ded15 (patch) |
tree | c99137d1931efb6586fcd1eff0d36a8c227133b0 |
parent | 4bd228c8a6b5b0c64dfac6febf7333e47e42ea26 (diff) |
download | glibc-5f92ec52e795dc004f8e8d17317e4572695ded15.tar.gz glibc-5f92ec52e795dc004f8e8d17317e4572695ded15.tar.xz glibc-5f92ec52e795dc004f8e8d17317e4572695ded15.zip |
Replace %xmm8 with %xmm0
Since ld.so preserves vector registers now, we can use %xmm0 to avoid the REX prefix.

	* sysdeps/x86_64/memset.S: Replace %xmm8 with %xmm0.
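For illustration only (this note and the encodings below are not part of the commit): %xmm0-%xmm7 can be encoded without a REX prefix, whereas %xmm8-%xmm15 always require one, so each rewritten SSE instruction should shrink by one byte. A minimal sketch of the expected machine code, assuming standard SSE2 encodings (verify with objdump -d on the built object):

	pxor	%xmm8, %xmm8	/* 66 45 0f ef c0 - 5 bytes, REX.RB needed for %xmm8 */
	pxor	%xmm0, %xmm0	/* 66 0f ef c0    - 4 bytes, no REX prefix */
	movdqu	%xmm8, (%rdi)	/* f3 44 0f 7f 07 - 5 bytes, REX.R needed for %xmm8 */
	movdqu	%xmm0, (%rdi)	/* f3 0f 7f 07    - 4 bytes, no REX prefix */

Behavior is unchanged; the register choice only affects code size, and it is safe because ld.so now preserves the vector registers, as noted above.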
-rw-r--r-- | ChangeLog | 4
-rw-r--r-- | sysdeps/x86_64/memset.S | 52
2 files changed, 30 insertions, 26 deletions
diff --git a/ChangeLog b/ChangeLog
index 73645c3773..362aceebc9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2015-08-25  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/memset.S: Replace %xmm8 with %xmm0.
+
 2015-08-25  Ondřej Bílka  <neleai@seznam.cz>
 
 	* debug/strcpy_chk.c: Improve performance.
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index e4962546c4..3855cc88b5 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -24,7 +24,7 @@
 ENTRY(__bzero)
 	movq	%rdi, %rax /* Set return value.  */
 	movq	%rsi, %rdx /* Set n.  */
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	jmp	L(entry_from_bzero)
 END(__bzero)
 weak_alias (__bzero, bzero)
@@ -33,10 +33,10 @@ weak_alias (__bzero, bzero)
 ENTRY(__memset_tail)
 	movq	%rcx, %rax /* Set return value.  */
 
-	movd	%esi, %xmm8
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	movd	%esi, %xmm0
+	punpcklbw	%xmm0, %xmm0
+	punpcklwd	%xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 
 	jmp	L(entry_from_bzero)
 END(__memset_tail)
@@ -50,57 +50,57 @@ END_CHK (__memset_chk)
 #endif
 
 ENTRY (memset)
-	movd	%esi, %xmm8
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	punpcklbw	%xmm0, %xmm0
+	punpcklwd	%xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 L(entry_from_bzero):
 	cmpq	$64, %rdx
 	ja	L(loop_start)
 	cmpq	$16, %rdx
 	jbe	L(less_16_bytes)
 	cmpq	$32, %rdx
-	movdqu	%xmm8, (%rdi)
-	movdqu	%xmm8, -16(%rdi,%rdx)
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm0, -16(%rdi,%rdx)
 	ja	L(between_32_64_bytes)
 L(return):
 	rep
 	ret
 	.p2align 4
 L(between_32_64_bytes):
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, -32(%rdi,%rdx)
 	ret
 	.p2align 4
 L(loop_start):
 	leaq	64(%rdi), %rcx
-	movdqu	%xmm8, (%rdi)
+	movdqu	%xmm0, (%rdi)
 	andq	$-64, %rcx
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
-	movdqu	%xmm8, 32(%rdi)
-	movdqu	%xmm8, -48(%rdi,%rdx)
-	movdqu	%xmm8, 48(%rdi)
-	movdqu	%xmm8, -64(%rdi,%rdx)
+	movdqu	%xmm0, -16(%rdi,%rdx)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, -32(%rdi,%rdx)
+	movdqu	%xmm0, 32(%rdi)
+	movdqu	%xmm0, -48(%rdi,%rdx)
+	movdqu	%xmm0, 48(%rdi)
+	movdqu	%xmm0, -64(%rdi,%rdx)
 	addq	%rdi, %rdx
 	andq	$-64, %rdx
 	cmpq	%rdx, %rcx
 	je	L(return)
 	.p2align 4
 L(loop):
-	movdqa	%xmm8, (%rcx)
-	movdqa	%xmm8, 16(%rcx)
-	movdqa	%xmm8, 32(%rcx)
-	movdqa	%xmm8, 48(%rcx)
+	movdqa	%xmm0, (%rcx)
+	movdqa	%xmm0, 16(%rcx)
+	movdqa	%xmm0, 32(%rcx)
+	movdqa	%xmm0, 48(%rcx)
 	addq	$64, %rcx
 	cmpq	%rcx, %rdx
 	jne	L(loop)
 	rep
 	ret
 L(less_16_bytes):
-	movq	%xmm8, %rcx
+	movq	%xmm0, %rcx
 	testb	$24, %dl
 	jne	L(between8_16bytes)
 	testb	$4, %dl