diff options
author | H.J. Lu <hongjiu.lu@intel.com> | 2010-08-26 22:09:34 -0700 |
---|---|---|
committer | Ulrich Drepper <drepper@redhat.com> | 2010-08-26 22:09:34 -0700 |
commit | 623aac7f84dfddee9bcf9d51f23612479cf672ec (patch) | |
tree | 355c57e1d98cff706ead0832461b060bc24ffc7c /sysdeps/x86_64/multiarch/strlen-sse4.S | |
parent | b416a900856ff871c06b08fa2c9c943fd86597da (diff) | |
download | glibc-623aac7f84dfddee9bcf9d51f23612479cf672ec.tar.gz glibc-623aac7f84dfddee9bcf9d51f23612479cf672ec.tar.xz glibc-623aac7f84dfddee9bcf9d51f23612479cf672ec.zip |
Unroll x86-64 strlen
Diffstat (limited to 'sysdeps/x86_64/multiarch/strlen-sse4.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/strlen-sse4.S | 85 |
1 files changed, 85 insertions, 0 deletions
diff --git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S new file mode 100644 index 0000000000..6b16ea7fa6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-sse4.S @@ -0,0 +1,85 @@ +/* strlen with SSE4 + Copyright (C) 2009, 2010 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@redhat.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/
+
+/* Assemble the routine only when SHARED is defined and NOT_IN_libc is
+   not defined.  */
+#if defined SHARED && !defined NOT_IN_libc
+
+#include <sysdep.h>
+
+/* __strlen_sse42: return in %rax the number of bytes that precede the
+   first NUL byte of the string whose address is passed in %rdi.
+
+   Register roles:
+     %r8    original string pointer, kept for the final subtraction
+     %rdi   string pointer rounded down to a 16-byte boundary, later
+	    advanced by 64 per loop iteration
+     %rcx   misalignment of the pointer (its low 4 bits) while the first
+	    block is examined; afterwards the index written by pcmpistri
+     %edx   bit mask of NUL positions found in the first aligned block
+     %xmm1  all-zero comparison operand
+
+   %rax, %rcx, %rdx, %rdi, %r8, %xmm1 and the flags are modified and
+   nothing is saved or restored.  Memory is read in whole 16-byte blocks
+   starting at the rounded-down address, so bytes before the start of
+   the string and after its NUL byte may be read.
+
+   ENTRY, END and L() are not defined in this file; presumably they come
+   from <sysdep.h>.  */
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (__strlen_sse42)
+	pxor	%xmm1, %xmm1
+	/* %ecx = pointer & 15, %rdi = pointer & ~15, %r8 = pointer.  */
+	movl	%edi, %ecx
+	movq	%rdi, %r8
+	andq	$~15, %rdi
+	xor	%edi, %ecx
+	/* Compare the first aligned block with zero; bit i of %edx is set
+	   when byte i of the block is NUL.  */
+	pcmpeqb	(%rdi), %xmm1
+	pmovmskb %xmm1, %edx
+	/* Clear the low %cl bits of the mask: they belong to bytes located
+	   before the start of the string.  */
+	shrl	%cl, %edx
+	shll	%cl, %edx
+	/* Set ZF from the mask (a shift count of 0 would leave the flags
+	   unchanged).  */
+	andl	%edx, %edx
+	jnz	L(less16bytes)
+	/* pcmpeqb overwrote %xmm1; make it all-zero again for pcmpistri.  */
+	pxor	%xmm1, %xmm1
+
+	/* Each pcmpistri below compares one 16-byte block with the
+	   all-zero %xmm1 using imm8 0x08 (unsigned bytes, "equal each").
+	   It sets ZF when the block contains a NUL byte and writes to
+	   %ecx the index of the first such byte, or 16 if there is none.
+	   Four blocks are examined per iteration.  */
+	.p2align 4
+L(more64bytes_loop):
+	pcmpistri $0x08, 16(%rdi), %xmm1
+	jz	L(more32bytes)
+
+	pcmpistri $0x08, 32(%rdi), %xmm1
+	jz	L(more48bytes)
+
+	pcmpistri $0x08, 48(%rdi), %xmm1
+	jz	L(more64bytes)
+
+	add	$64, %rdi
+	pcmpistri $0x08, (%rdi), %xmm1
+	jnz	L(more64bytes_loop)
+	/* NUL found in the block at the already advanced %rdi:
+	   length = %rdi + index - original pointer.  */
+	leaq	(%rdi,%rcx), %rax
+	subq	%r8, %rax
+	ret
+
+	/* Reached when the NUL byte is in the block at 16(%rdi).  */
+	.p2align 4
+L(more32bytes):
+	leaq	16(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	/* Reached when the NUL byte is in the block at 32(%rdi).  */
+	.p2align 4
+L(more48bytes):
+	leaq	32(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	/* Reached when the NUL byte is in the block at 48(%rdi).  */
+	.p2align 4
+L(more64bytes):
+	leaq	48(%rdi,%rcx, 1), %rax
+	subq	%r8, %rax
+	ret
+
+	/* Reached when the NUL byte is in the first aligned block.
+	   %rdi becomes (aligned pointer - original pointer), i.e. minus
+	   the misalignment; bsfl yields the offset of the first NUL inside
+	   the block; their sum is the string length.  */
+	.p2align 4
+L(less16bytes):
+	subq	%r8, %rdi
+	bsfl	%edx, %eax
+	addq	%rdi, %rax
+	ret
+
+END (__strlen_sse42)
+
+#endif |