diff options
author | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-07-12 12:29:08 -0700 |
---|---|---|
committer | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-07-13 14:55:31 -0700 |
commit | e19bb87c97a3a109c418f68cebbea27ebc2808f9 (patch) | |
tree | 8762cdb41609a65bd601c9bbc3608649ec0e6c8a /sysdeps | |
parent | 64479f11b721fa33d17d623db31d047a11f363a1 (diff) | |
download | glibc-e19bb87c97a3a109c418f68cebbea27ebc2808f9.tar.gz glibc-e19bb87c97a3a109c418f68cebbea27ebc2808f9.tar.xz glibc-e19bb87c97a3a109c418f68cebbea27ebc2808f9.zip |
x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S
This commit doesn't affect libc.so.6; it's just housekeeping to prepare for adding explicit ISA level support. Tested build on x86_64 and x86_32 with/without multiarch.
Diffstat (limited to 'sysdeps')
-rw-r--r-- | sysdeps/x86_64/multiarch/wcslen-sse2.S | 221 | ||||
-rw-r--r-- | sysdeps/x86_64/wcslen.S | 216 |
2 files changed, 218 insertions, 219 deletions
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse2.S b/sysdeps/x86_64/multiarch/wcslen-sse2.S index 2b3a9efd64..944c3bd9c6 100644 --- a/sysdeps/x86_64/multiarch/wcslen-sse2.S +++ b/sysdeps/x86_64/multiarch/wcslen-sse2.S @@ -17,10 +17,221 @@ <https://www.gnu.org/licenses/>. */ #if IS_IN (libc) -# define __wcslen __wcslen_sse2 - -# undef weak_alias -# define weak_alias(__wcslen, wcslen) +# ifndef WCSLEN +# define WCSLEN __wcslen_sse2 +# endif #endif -#include "../wcslen.S" +#include <sysdep.h> + + .text +ENTRY (WCSLEN) + cmpl $0, (%rdi) + jz L(exit_tail0) + cmpl $0, 4(%rdi) + jz L(exit_tail1) + cmpl $0, 8(%rdi) + jz L(exit_tail2) + cmpl $0, 12(%rdi) + jz L(exit_tail3) + cmpl $0, 16(%rdi) + jz L(exit_tail4) + cmpl $0, 20(%rdi) + jz L(exit_tail5) + cmpl $0, 24(%rdi) + jz L(exit_tail6) + cmpl $0, 28(%rdi) + jz L(exit_tail7) + + pxor %xmm0, %xmm0 + + lea 32(%rdi), %rax + addq $16, %rdi + and $-16, %rax + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + 
addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + and $-0x40, %rax + + .p2align 4 +L(aligned_64_loop): + movaps (%rax), %xmm0 + movaps 16(%rax), %xmm1 + movaps 32(%rax), %xmm2 + movaps 48(%rax), %xmm6 + + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx + addq $64, %rax + test %edx, %edx + jz L(aligned_64_loop) + + pcmpeqd -64(%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $48, %rdi + test %edx, %edx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx + addq $-16, %rdi + test %edx, %edx + jnz L(exit) + + pcmpeqd -32(%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $-16, %rdi + test %edx, %edx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx + addq $-16, %rdi + test %edx, %edx + jz L(aligned_64_loop) + + .p2align 4 +L(exit): + sub %rdi, %rax + shr $2, %rax + test %dl, %dl + jz L(exit_high) + + andl $15, %edx + jz L(exit_1) + ret + + /* No align here. Naturally aligned % 16 == 1. */ +L(exit_high): + andl $(15 << 8), %edx + jz L(exit_3) + add $2, %rax + ret + + .p2align 3 +L(exit_1): + add $1, %rax + ret + + .p2align 3 +L(exit_3): + add $3, %rax + ret + + .p2align 3 +L(exit_tail0): + xorl %eax, %eax + ret + + .p2align 3 +L(exit_tail1): + movl $1, %eax + ret + + .p2align 3 +L(exit_tail2): + movl $2, %eax + ret + + .p2align 3 +L(exit_tail3): + movl $3, %eax + ret + + .p2align 3 +L(exit_tail4): + movl $4, %eax + ret + + .p2align 3 +L(exit_tail5): + movl $5, %eax + ret + + .p2align 3 +L(exit_tail6): + movl $6, %eax + ret + + .p2align 3 +L(exit_tail7): + movl $7, %eax + ret + +END (WCSLEN) diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S index d641141d75..588a0fbe01 100644 --- a/sysdeps/x86_64/wcslen.S +++ b/sysdeps/x86_64/wcslen.S @@ -16,218 +16,6 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#include <sysdep.h> - - .text -ENTRY (__wcslen) - cmpl $0, (%rdi) - jz L(exit_tail0) - cmpl $0, 4(%rdi) - jz L(exit_tail1) - cmpl $0, 8(%rdi) - jz L(exit_tail2) - cmpl $0, 12(%rdi) - jz L(exit_tail3) - cmpl $0, 16(%rdi) - jz L(exit_tail4) - cmpl $0, 20(%rdi) - jz L(exit_tail5) - cmpl $0, 24(%rdi) - jz L(exit_tail6) - cmpl $0, 28(%rdi) - jz L(exit_tail7) - - pxor %xmm0, %xmm0 - - lea 32(%rdi), %rax - addq $16, %rdi - and $-16, %rax - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - and $-0x40, %rax - - .p2align 4 -L(aligned_64_loop): - movaps (%rax), %xmm0 - movaps 16(%rax), %xmm1 - movaps 32(%rax), %xmm2 - movaps 48(%rax), %xmm6 - - pminub %xmm1, %xmm0 - pminub %xmm6, %xmm2 - pminub %xmm0, %xmm2 - pcmpeqd %xmm3, %xmm2 - pmovmskb %xmm2, %edx - addq $64, %rax - test %edx, %edx - jz 
L(aligned_64_loop) - - pcmpeqd -64(%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $48, %rdi - test %edx, %edx - jnz L(exit) - - pcmpeqd %xmm1, %xmm3 - pmovmskb %xmm3, %edx - addq $-16, %rdi - test %edx, %edx - jnz L(exit) - - pcmpeqd -32(%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $-16, %rdi - test %edx, %edx - jnz L(exit) - - pcmpeqd %xmm6, %xmm3 - pmovmskb %xmm3, %edx - addq $-16, %rdi - test %edx, %edx - jz L(aligned_64_loop) - - .p2align 4 -L(exit): - sub %rdi, %rax - shr $2, %rax - test %dl, %dl - jz L(exit_high) - - andl $15, %edx - jz L(exit_1) - ret - - /* No align here. Naturally aligned % 16 == 1. */ -L(exit_high): - andl $(15 << 8), %edx - jz L(exit_3) - add $2, %rax - ret - - .p2align 3 -L(exit_1): - add $1, %rax - ret - - .p2align 3 -L(exit_3): - add $3, %rax - ret - - .p2align 3 -L(exit_tail0): - xorl %eax, %eax - ret - - .p2align 3 -L(exit_tail1): - movl $1, %eax - ret - - .p2align 3 -L(exit_tail2): - movl $2, %eax - ret - - .p2align 3 -L(exit_tail3): - movl $3, %eax - ret - - .p2align 3 -L(exit_tail4): - movl $4, %eax - ret - - .p2align 3 -L(exit_tail5): - movl $5, %eax - ret - - .p2align 3 -L(exit_tail6): - movl $6, %eax - ret - - .p2align 3 -L(exit_tail7): - movl $7, %eax - ret - -END (__wcslen) - +#define WCSLEN __wcslen +#include "multiarch/wcslen-sse2.S" weak_alias(__wcslen, wcslen) |