diff options
author | Ondrej Bilka <neleai@seznam.cz> | 2013-03-06 21:41:32 +0100 |
---|---|---|
committer | Ondrej Bilka <neleai@seznam.cz> | 2013-03-06 21:54:01 +0100 |
commit | b79188d71716b6286866e06add976fe84100595e (patch) | |
tree | 194a834b5c018ad08b13ed18ef8d9b708ad832c7 /sysdeps/x86_64/multiarch/strcat-ssse3.S | |
parent | 39120df9b9e2ad9674c0d6265757a7a68e10051f (diff) | |
download | glibc-b79188d71716b6286866e06add976fe84100595e.tar.gz glibc-b79188d71716b6286866e06add976fe84100595e.tar.xz glibc-b79188d71716b6286866e06add976fe84100595e.zip |
* sysdeps/x86_64/strlen.S: Replace with new SSE2 based implementation
which is faster on all x86_64 architectures. Tested on AMD, Intel Nehalem, SNB, IVB.
Diffstat (limited to 'sysdeps/x86_64/multiarch/strcat-ssse3.S')
-rw-r--r-- | sysdeps/x86_64/multiarch/strcat-ssse3.S | 312 |
1 files changed, 309 insertions, 3 deletions
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S index fea9d11b40..901e66f2c8 100644 --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -33,11 +33,317 @@ ENTRY (STRCAT) mov %rdx, %r8 # endif -# define RETURN jmp L(StartStrcpyPart) -# include "strlen-sse2-no-bsf.S" + xor %eax, %eax + cmpb $0, (%rdi) + jz L(exit_tail0) + cmpb $0, 1(%rdi) + jz L(exit_tail1) + cmpb $0, 2(%rdi) + jz L(exit_tail2) + cmpb $0, 3(%rdi) + jz L(exit_tail3) + + cmpb $0, 4(%rdi) + jz L(exit_tail4) + cmpb $0, 5(%rdi) + jz L(exit_tail5) + cmpb $0, 6(%rdi) + jz L(exit_tail6) + cmpb $0, 7(%rdi) + jz L(exit_tail7) + + cmpb $0, 8(%rdi) + jz L(exit_tail8) + cmpb $0, 9(%rdi) + jz L(exit_tail9) + cmpb $0, 10(%rdi) + jz L(exit_tail10) + cmpb $0, 11(%rdi) + jz L(exit_tail11) + + cmpb $0, 12(%rdi) + jz L(exit_tail12) + cmpb $0, 13(%rdi) + jz L(exit_tail13) + cmpb $0, 14(%rdi) + jz L(exit_tail14) + cmpb $0, 15(%rdi) + jz L(exit_tail15) + pxor %xmm0, %xmm0 + lea 16(%rdi), %rcx + lea 16(%rdi), %rax + and $-16, %rax + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax -# undef RETURN + .p2align 4 +L(aligned_64): + pcmpeqb (%rax), %xmm0 + pcmpeqb 16(%rax), %xmm1 + pcmpeqb 32(%rax), %xmm2 + pcmpeqb 48(%rax), %xmm3 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %r11d + pmovmskb %xmm2, %r10d + pmovmskb %xmm3, %r9d + or %edx, %r9d + or %r11d, %r9d + or %r10d, %r9d + lea 64(%rax), %rax + jz L(aligned_64) + + test %edx, %edx + jnz L(aligned_64_exit_16) + test %r11d, %r11d + jnz L(aligned_64_exit_32) + test %r10d, %r10d + jnz L(aligned_64_exit_48) + +L(aligned_64_exit_64): + pmovmskb %xmm3, %edx + jmp L(exit) + +L(aligned_64_exit_48): + lea -16(%rax), %rax + mov %r10d, %edx + jmp L(exit) + +L(aligned_64_exit_32): + lea -32(%rax), %rax + mov %r11d, %edx + jmp L(exit) + +L(aligned_64_exit_16): + lea -48(%rax), %rax + +L(exit): + sub %rcx, %rax + test %dl, %dl + jz L(exit_high) + test $0x01, %dl + jnz L(exit_tail0) + + test $0x02, %dl + jnz L(exit_tail1) + + test $0x04, %dl + jnz L(exit_tail2) + + test $0x08, %dl + jnz L(exit_tail3) + + test $0x10, %dl + jnz L(exit_tail4) + + test $0x20, %dl + jnz L(exit_tail5) + + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax +L(exit_tail0): + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_high): + add $8, %eax + test $0x01, %dh + jnz L(exit_tail0) + + test $0x02, %dh + jnz L(exit_tail1) + + test $0x04, %dh + jnz L(exit_tail2) + + test $0x08, %dh + jnz L(exit_tail3) + + test $0x10, %dh + jnz L(exit_tail4) + + test $0x20, %dh + jnz L(exit_tail5) + + test $0x40, %dh + jnz L(exit_tail6) + add $7, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail1): + add $1, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail2): + add $2, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail3): + add $3, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail4): + add $4, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail5): + add $5, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail6): + add $6, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail7): + add $7, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail8): + add $8, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail9): + add $9, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail10): + add $10, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail11): + add $11, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail12): + add $12, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail13): + add $13, %eax + jmp L(StartStrcpyPart) + .p2align 4 +L(exit_tail14): + add $14, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail15): + add $15, %eax + + .p2align 4 L(StartStrcpyPart): mov %rsi, %rcx lea (%rdi, %rax), %rdx |