diff options
author | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-07-12 12:29:01 -0700 |
---|---|---|
committer | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-07-13 14:55:31 -0700 |
commit | 58e6cd4bcbe9f29949f1545953a17145bf732aa0 (patch) | |
tree | d7b1785858dee9ebb7cee91d6881127125b352cf | |
parent | 60a583ec601351c7d9b026e6a62ee6bab0cbf80b (diff) | |
download | glibc-58e6cd4bcbe9f29949f1545953a17145bf732aa0.tar.gz glibc-58e6cd4bcbe9f29949f1545953a17145bf732aa0.tar.xz glibc-58e6cd4bcbe9f29949f1545953a17145bf732aa0.zip |
x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S
This commit doesn't affect libc.so.6, its just housekeeping to prepare for adding explicit ISA level support. Tested build on x86_64 and x86_32 with/without multiarch.
-rw-r--r-- | sysdeps/x86_64/multiarch/rtld-strlen.S | 18 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/rtld-strnlen.S | 18 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strlen-sse2.S | 260 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strlen-vec.S | 267 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strnlen-sse2.S | 12 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 4 | ||||
-rw-r--r-- | sysdeps/x86_64/strlen.S | 3 | ||||
-rw-r--r-- | sysdeps/x86_64/strnlen.S | 6 |
9 files changed, 306 insertions, 286 deletions
diff --git a/sysdeps/x86_64/multiarch/rtld-strlen.S b/sysdeps/x86_64/multiarch/rtld-strlen.S new file mode 100644 index 0000000000..609d26256e --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-strlen.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "../strlen.S" diff --git a/sysdeps/x86_64/multiarch/rtld-strnlen.S b/sysdeps/x86_64/multiarch/rtld-strnlen.S new file mode 100644 index 0000000000..ef2d64abc2 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-strnlen.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "../strnlen.S" diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S index 660b327ed2..5be72267d5 100644 --- a/sysdeps/x86_64/multiarch/strlen-sse2.S +++ b/sysdeps/x86_64/multiarch/strlen-sse2.S @@ -16,8 +16,260 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) -# define strlen __strlen_sse2 -#endif +#if IS_IN (libc) || defined STRLEN + +# ifndef STRLEN +# define STRLEN __strlen_sse2 +# endif + + +# include <sysdep.h> + +# ifdef AS_WCSLEN +# define PMINU pminud +# define PCMPEQ pcmpeqd +# define SHIFT_RETURN shrq $2, %rax +# else +# define PMINU pminub +# define PCMPEQ pcmpeqb +# define SHIFT_RETURN +# endif + +# ifndef SECTION +# define SECTION(p) p +# endif + +/* Long lived register in strlen(s), strnlen(s, n) are: + + %xmm3 - zero + %rdi - s + %r10 (s+n) & (~(64-1)) + %r11 s+n +*/ + + + .section SECTION(.text),"ax",@progbits +ENTRY(STRLEN) + +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ +# define FIND_ZERO \ + PCMPEQ (%rax), %xmm0; \ + PCMPEQ 16(%rax), %xmm1; \ + PCMPEQ 32(%rax), %xmm2; \ + PCMPEQ 48(%rax), %xmm3; \ + pmovmskb %xmm0, %esi; \ + pmovmskb %xmm1, %edx; \ + pmovmskb %xmm2, %r8d; \ + pmovmskb %xmm3, %ecx; \ + salq $16, %rdx; \ + salq $16, %rcx; \ + orq %rsi, %rdx; \ + orq %r8, %rcx; \ + salq $32, %rcx; \ + orq %rcx, %rdx; + +# ifdef AS_STRNLEN +/* Do not read anything when n==0. */ + test %RSI_LP, %RSI_LP + jne L(n_nonzero) + xor %rax, %rax + ret +L(n_nonzero): +# ifdef AS_WCSLEN +/* Check for overflow from maxlen * sizeof(wchar_t). If it would + overflow the only way this program doesn't have undefined behavior + is if there is a null terminator in valid memory so wcslen will + suffice. */ + mov %RSI_LP, %R10_LP + sar $62, %R10_LP + jnz __wcslen_sse4_1 + sal $2, %RSI_LP +# endif + +/* Initialize long lived registers. */ + add %RDI_LP, %RSI_LP + mov %RSI_LP, %R10_LP + and $-64, %R10_LP + mov %RSI_LP, %R11_LP +# endif + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + movq %rdi, %rax + movq %rdi, %rcx + andq $4095, %rcx +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ + cmpq $4047, %rcx +/* We cannot unify this branching as it would be ~6 cycles slower. */ + ja L(cross_page) + +# ifdef AS_STRNLEN +/* Test if end is among first 64 bytes. */ +# define STRNLEN_PROLOG \ + mov %r11, %rsi; \ + subq %rax, %rsi; \ + andq $-64, %rax; \ + testq $-64, %rsi; \ + je L(strnlen_ret) +# else +# define STRNLEN_PROLOG andq $-64, %rax; +# endif + +/* Ignore bits in mask that come before start of string. */ +# define PROLOG(lab) \ + movq %rdi, %rcx; \ + xorq %rax, %rcx; \ + STRNLEN_PROLOG; \ + sarq %cl, %rdx; \ + test %rdx, %rdx; \ + je L(lab); \ + bsfq %rdx, %rax; \ + SHIFT_RETURN; \ + ret + +# ifdef AS_STRNLEN + andq $-16, %rax + FIND_ZERO +# else + /* Test first 16 bytes unaligned. */ + movdqu (%rax), %xmm4 + PCMPEQ %xmm0, %xmm4 + pmovmskb %xmm4, %edx + test %edx, %edx + je L(next48_bytes) + bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ + SHIFT_RETURN + ret + +L(next48_bytes): +/* Same as FIND_ZERO except we do not check first 16 bytes. */ + andq $-16, %rax + PCMPEQ 16(%rax), %xmm1 + PCMPEQ 32(%rax), %xmm2 + PCMPEQ 48(%rax), %xmm3 + pmovmskb %xmm1, %edx + pmovmskb %xmm2, %r8d + pmovmskb %xmm3, %ecx + salq $16, %rdx + salq $16, %rcx + orq %r8, %rcx + salq $32, %rcx + orq %rcx, %rdx +# endif -#include "strlen-vec.S" + /* When no zero byte is found xmm1-3 are zero so we do not have to + zero them. */ + PROLOG(loop) + + .p2align 4 +L(cross_page): + andq $-64, %rax + FIND_ZERO + PROLOG(loop_init) + +# ifdef AS_STRNLEN +/* We must do this check to correctly handle strnlen (s, -1). */ +L(strnlen_ret): + bts %rsi, %rdx + sarq %cl, %rdx + test %rdx, %rdx + je L(loop_init) + bsfq %rdx, %rax + SHIFT_RETURN + ret +# endif + .p2align 4 +L(loop_init): + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 +# ifdef AS_STRNLEN + .p2align 4 +L(loop): + + addq $64, %rax + cmpq %rax, %r10 + je L(exit_end) + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit) + jmp L(loop) + + .p2align 4 +L(exit_end): + cmp %rax, %r11 + je L(first) /* Do not read when end is at page boundary. */ + pxor %xmm0, %xmm0 + FIND_ZERO + +L(first): + bts %r11, %rdx + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + + .p2align 4 +L(exit): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +# else + + /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ + .p2align 4 +L(loop): + + movdqa 64(%rax), %xmm0 + PMINU 80(%rax), %xmm0 + PMINU 96(%rax), %xmm0 + PMINU 112(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit64) + + subq $-128, %rax + + movdqa (%rax), %xmm0 + PMINU 16(%rax), %xmm0 + PMINU 32(%rax), %xmm0 + PMINU 48(%rax), %xmm0 + PCMPEQ %xmm3, %xmm0 + pmovmskb %xmm0, %edx + testl %edx, %edx + jne L(exit0) + jmp L(loop) + + .p2align 4 +L(exit64): + addq $64, %rax +L(exit0): + pxor %xmm0, %xmm0 + FIND_ZERO + + bsfq %rdx, %rdx + addq %rdx, %rax + subq %rdi, %rax + SHIFT_RETURN + ret + +# endif + +END(STRLEN) +#endif diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S deleted file mode 100644 index 874123d604..0000000000 --- a/sysdeps/x86_64/multiarch/strlen-vec.S +++ /dev/null @@ -1,267 +0,0 @@ -/* SSE2 version of strlen and SSE4.1 version of wcslen. - Copyright (C) 2012-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#ifdef AS_WCSLEN -# define PMINU pminud -# define PCMPEQ pcmpeqd -# define SHIFT_RETURN shrq $2, %rax -#else -# define PMINU pminub -# define PCMPEQ pcmpeqb -# define SHIFT_RETURN -#endif - -#ifndef SECTION -# define SECTION(p) p -#endif - -/* Long lived register in strlen(s), strnlen(s, n) are: - - %xmm3 - zero - %rdi - s - %r10 (s+n) & (~(64-1)) - %r11 s+n -*/ - - - .section SECTION(.text),"ax",@progbits -ENTRY(strlen) - -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ -#define FIND_ZERO \ - PCMPEQ (%rax), %xmm0; \ - PCMPEQ 16(%rax), %xmm1; \ - PCMPEQ 32(%rax), %xmm2; \ - PCMPEQ 48(%rax), %xmm3; \ - pmovmskb %xmm0, %esi; \ - pmovmskb %xmm1, %edx; \ - pmovmskb %xmm2, %r8d; \ - pmovmskb %xmm3, %ecx; \ - salq $16, %rdx; \ - salq $16, %rcx; \ - orq %rsi, %rdx; \ - orq %r8, %rcx; \ - salq $32, %rcx; \ - orq %rcx, %rdx; - -#ifdef AS_STRNLEN -/* Do not read anything when n==0. */ - test %RSI_LP, %RSI_LP - jne L(n_nonzero) - xor %rax, %rax - ret -L(n_nonzero): -# ifdef AS_WCSLEN -/* Check for overflow from maxlen * sizeof(wchar_t). If it would - overflow the only way this program doesn't have undefined behavior - is if there is a null terminator in valid memory so wcslen will - suffice. */ - mov %RSI_LP, %R10_LP - sar $62, %R10_LP - jnz __wcslen_sse4_1 - sal $2, %RSI_LP -# endif - -/* Initialize long lived registers. */ - add %RDI_LP, %RSI_LP - mov %RSI_LP, %R10_LP - and $-64, %R10_LP - mov %RSI_LP, %R11_LP -#endif - - pxor %xmm0, %xmm0 - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 - movq %rdi, %rax - movq %rdi, %rcx - andq $4095, %rcx -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ - cmpq $4047, %rcx -/* We cannot unify this branching as it would be ~6 cycles slower. */ - ja L(cross_page) - -#ifdef AS_STRNLEN -/* Test if end is among first 64 bytes. */ -# define STRNLEN_PROLOG \ - mov %r11, %rsi; \ - subq %rax, %rsi; \ - andq $-64, %rax; \ - testq $-64, %rsi; \ - je L(strnlen_ret) -#else -# define STRNLEN_PROLOG andq $-64, %rax; -#endif - -/* Ignore bits in mask that come before start of string. */ -#define PROLOG(lab) \ - movq %rdi, %rcx; \ - xorq %rax, %rcx; \ - STRNLEN_PROLOG; \ - sarq %cl, %rdx; \ - test %rdx, %rdx; \ - je L(lab); \ - bsfq %rdx, %rax; \ - SHIFT_RETURN; \ - ret - -#ifdef AS_STRNLEN - andq $-16, %rax - FIND_ZERO -#else - /* Test first 16 bytes unaligned. */ - movdqu (%rax), %xmm4 - PCMPEQ %xmm0, %xmm4 - pmovmskb %xmm4, %edx - test %edx, %edx - je L(next48_bytes) - bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ - SHIFT_RETURN - ret - -L(next48_bytes): -/* Same as FIND_ZERO except we do not check first 16 bytes. */ - andq $-16, %rax - PCMPEQ 16(%rax), %xmm1 - PCMPEQ 32(%rax), %xmm2 - PCMPEQ 48(%rax), %xmm3 - pmovmskb %xmm1, %edx - pmovmskb %xmm2, %r8d - pmovmskb %xmm3, %ecx - salq $16, %rdx - salq $16, %rcx - orq %r8, %rcx - salq $32, %rcx - orq %rcx, %rdx -#endif - - /* When no zero byte is found xmm1-3 are zero so we do not have to - zero them. */ - PROLOG(loop) - - .p2align 4 -L(cross_page): - andq $-64, %rax - FIND_ZERO - PROLOG(loop_init) - -#ifdef AS_STRNLEN -/* We must do this check to correctly handle strnlen (s, -1). */ -L(strnlen_ret): - bts %rsi, %rdx - sarq %cl, %rdx - test %rdx, %rdx - je L(loop_init) - bsfq %rdx, %rax - SHIFT_RETURN - ret -#endif - .p2align 4 -L(loop_init): - pxor %xmm1, %xmm1 - pxor %xmm2, %xmm2 - pxor %xmm3, %xmm3 -#ifdef AS_STRNLEN - .p2align 4 -L(loop): - - addq $64, %rax - cmpq %rax, %r10 - je L(exit_end) - - movdqa (%rax), %xmm0 - PMINU 16(%rax), %xmm0 - PMINU 32(%rax), %xmm0 - PMINU 48(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit) - jmp L(loop) - - .p2align 4 -L(exit_end): - cmp %rax, %r11 - je L(first) /* Do not read when end is at page boundary. */ - pxor %xmm0, %xmm0 - FIND_ZERO - -L(first): - bts %r11, %rdx - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - - .p2align 4 -L(exit): - pxor %xmm0, %xmm0 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - -#else - - /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ - .p2align 4 -L(loop): - - movdqa 64(%rax), %xmm0 - PMINU 80(%rax), %xmm0 - PMINU 96(%rax), %xmm0 - PMINU 112(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit64) - - subq $-128, %rax - - movdqa (%rax), %xmm0 - PMINU 16(%rax), %xmm0 - PMINU 32(%rax), %xmm0 - PMINU 48(%rax), %xmm0 - PCMPEQ %xmm3, %xmm0 - pmovmskb %xmm0, %edx - testl %edx, %edx - jne L(exit0) - jmp L(loop) - - .p2align 4 -L(exit64): - addq $64, %rax -L(exit0): - pxor %xmm0, %xmm0 - FIND_ZERO - - bsfq %rdx, %rdx - addq %rdx, %rax - subq %rdi, %rax - SHIFT_RETURN - ret - -#endif - -END(strlen) diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2.S b/sysdeps/x86_64/multiarch/strnlen-sse2.S index c4f395c210..a50c7d6a28 100644 --- a/sysdeps/x86_64/multiarch/strnlen-sse2.S +++ b/sysdeps/x86_64/multiarch/strnlen-sse2.S @@ -17,12 +17,10 @@ <https://www.gnu.org/licenses/>. */ #if IS_IN (libc) -# define __strnlen __strnlen_sse2 - -# undef weak_alias -# define weak_alias(__strnlen, strnlen) -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(strnlen) +# ifndef STRLEN +# define STRLEN __strnlen_sse2 +# endif #endif -#include "../strnlen.S" +#define AS_STRNLEN +#include "strlen-sse2.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S index e306a77f51..c88e8342a1 100644 --- a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S +++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S @@ -1,5 +1,5 @@ #define AS_WCSLEN -#define strlen __wcslen_sse4_1 +#define STRLEN __wcslen_sse4_1 #define SECTION(p) p##.sse4.1 -#include "strlen-vec.S" +#include "strlen-sse2.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S index d2f7dd6e22..17cdedc2a9 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S @@ -1,6 +1,6 @@ #define AS_WCSLEN #define AS_STRNLEN -#define strlen __wcsnlen_sse4_1 +#define STRLEN __wcsnlen_sse4_1 #define SECTION(p) p##.sse4.1 -#include "strlen-vec.S" +#include "strlen-sse2.S" diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S index e1f0b19f2f..c2f5674f8d 100644 --- a/sysdeps/x86_64/strlen.S +++ b/sysdeps/x86_64/strlen.S @@ -16,6 +16,7 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#include "multiarch/strlen-vec.S" +#define STRLEN strlen +#include "multiarch/strlen-sse2.S" libc_hidden_builtin_def (strlen) diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S index d3c43ac482..174970d58f 100644 --- a/sysdeps/x86_64/strnlen.S +++ b/sysdeps/x86_64/strnlen.S @@ -1,6 +1,6 @@ -#define AS_STRNLEN -#define strlen __strnlen -#include "strlen.S" +#define STRLEN __strnlen +#include "multiarch/strnlen-sse2.S" +libc_hidden_def (__strnlen) weak_alias (__strnlen, strnlen); libc_hidden_builtin_def (strnlen) |