From bf2071eda32528ee8b0bb89544152646684a2cf3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 10 Feb 2015 18:30:56 +0100 Subject: x86_64/memset: simple optimizations "and $0xff,%esi" is a six-byte insn (81 e6 ff 00 00 00); we can use the 4-byte "movzbl %sil,%esi" (40 0f b6 f6) instead. 64-bit imul is slow, so move it as far up as possible so that the result (rax) has more time to be ready by the time we start using it in memory stores. There is no need to shuffle registers in preparation for "rep stos" if we are not going to take that code path. Thus, the patch moves the "jump if len < 16" instructions up, and changes the alternate code path to use rdx and rdi instead of rcx and r8. Signed-off-by: Denys Vlasenko --- src/string/x86_64/memset.s | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/string/x86_64/memset.s b/src/string/x86_64/memset.s index fc06eef8..263336b5 100644 --- a/src/string/x86_64/memset.s +++ b/src/string/x86_64/memset.s @@ -1,41 +1,43 @@ .global memset .type memset,@function memset: - and $0xff,%esi + movzbl %sil,%esi mov $0x101010101010101,%rax - mov %rdx,%rcx - mov %rdi,%r8 + # 64-bit imul has 3-7 cycles latency, launch early imul %rsi,%rax - cmp $16,%rcx + + cmp $16,%rdx jb 1f - mov %rax,-8(%rdi,%rcx) + mov %rdx,%rcx + mov %rdi,%r8 shr $3,%rcx + mov %rax,-8(%rdi,%rdx) rep stosq mov %r8,%rax ret -1: test %ecx,%ecx +1: test %edx,%edx jz 1f mov %al,(%rdi) - mov %al,-1(%rdi,%rcx) - cmp $2,%ecx + mov %al,-1(%rdi,%rdx) + cmp $2,%edx jbe 1f mov %al,1(%rdi) - mov %al,-2(%rdi,%rcx) - cmp $4,%ecx + mov %al,-2(%rdi,%rdx) + cmp $4,%edx jbe 1f mov %eax,(%rdi) - mov %eax,-4(%rdi,%rcx) - cmp $8,%ecx + mov %eax,-4(%rdi,%rdx) + cmp $8,%edx jbe 1f mov %eax,4(%rdi) - mov %eax,-8(%rdi,%rcx) + mov %eax,-8(%rdi,%rdx) -1: mov %r8,%rax +1: mov %rdi,%rax ret -- cgit 1.4.1