diff options
author | H.J. Lu <hongjiu.lu@intel.com> | 2010-01-13 07:51:48 -0800 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 2010-01-13 07:51:48 -0800 |
commit | 5a7af22fbb661e351473bfde639beeac16fc32f7 (patch) | |
tree | 70b559065a13a15cf87e1907c35cda904f10a64f /sysdeps/x86_64/multiarch/strlen.S | |
parent | 52e96a8092e4c0bfef5bd9748b35861bba6f6b88 (diff) | |
download | glibc-5a7af22fbb661e351473bfde639beeac16fc32f7.tar.gz glibc-5a7af22fbb661e351473bfde639beeac16fc32f7.tar.xz glibc-5a7af22fbb661e351473bfde639beeac16fc32f7.zip |
Unroll the loop in the x86-64 SSE4.2 strlen.
Diffstat (limited to 'sysdeps/x86_64/multiarch/strlen.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/strlen.S | 60 |
1 file changed, 45 insertions, 15 deletions
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 509f9c9605..f9641131fa 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -46,28 +46,58 @@ END(strlen)
 __strlen_sse42:
 	cfi_startproc
 	CALL_MCOUNT
-	pxor %xmm2, %xmm2
-	movq %rdi, %rcx
+	pxor %xmm1, %xmm1
+	movl %edi, %ecx
 	movq %rdi, %r8
 	andq $~15, %rdi
-	movdqa %xmm2, %xmm1
-	pcmpeqb (%rdi), %xmm2
-	orl $0xffffffff, %esi
-	subq %rdi, %rcx
-	shll %cl, %esi
-	pmovmskb %xmm2, %edx
-	andl %esi, %edx
-	jnz 1f
-
-2:	pcmpistri $0x08, 16(%rdi), %xmm1
-	leaq 16(%rdi), %rdi
-	jnz 2b
+	xor %edi, %ecx
+	pcmpeqb (%rdi), %xmm1
+	pmovmskb %xmm1, %edx
+	shrl %cl, %edx
+	shll %cl, %edx
+	andl %edx, %edx
+	jnz L(less16bytes)
+	pxor %xmm1, %xmm1
 
+	.p2align 4
+L(more64bytes_loop):
+	pcmpistri $0x08, 16(%rdi), %xmm1
+	jz L(more32bytes)
+
+	pcmpistri $0x08, 32(%rdi), %xmm1
+	jz L(more48bytes)
+
+	pcmpistri $0x08, 48(%rdi), %xmm1
+	jz L(more64bytes)
+
+	add $64, %rdi
+	pcmpistri $0x08, (%rdi), %xmm1
+	jnz L(more64bytes_loop)
 	leaq (%rdi,%rcx), %rax
 	subq %r8, %rax
 	ret
 
-1:	subq %r8, %rdi
+	.p2align 4
+L(more32bytes):
+	leaq 16(%rdi,%rcx, 1), %rax
+	subq %r8, %rax
+	ret
+
+	.p2align 4
+L(more48bytes):
+	leaq 32(%rdi,%rcx, 1), %rax
+	subq %r8, %rax
+	ret
+
+	.p2align 4
+L(more64bytes):
+	leaq 48(%rdi,%rcx, 1), %rax
+	subq %r8, %rax
+	ret
+
+	.p2align 4
+L(less16bytes):
+	subq %r8, %rdi
 	bsfl %edx, %eax
 	addq %rdi, %rax
 	ret