From cd080d07410426c5ce211509eb0d8fd0901f673a Mon Sep 17 00:00:00 2001 From: Noah Goldstein Date: Tue, 12 Jul 2022 12:29:05 -0700 Subject: x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S This commit doesn't affect libc.so.6, it's just housekeeping to prepare for adding explicit ISA level support. Tested build on x86_64 and x86_32 with/without multiarch. --- sysdeps/x86_64/multiarch/rtld-strchr.S | 18 +++ sysdeps/x86_64/multiarch/rtld-strchrnul.S | 18 +++ sysdeps/x86_64/multiarch/strchr-sse2.S | 175 ++++++++++++++++++++++++++++-- sysdeps/x86_64/multiarch/strchrnul-sse2.S | 11 +- 4 files changed, 209 insertions(+), 13 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/rtld-strchr.S create mode 100644 sysdeps/x86_64/multiarch/rtld-strchrnul.S (limited to 'sysdeps/x86_64/multiarch') diff --git a/sysdeps/x86_64/multiarch/rtld-strchr.S b/sysdeps/x86_64/multiarch/rtld-strchr.S new file mode 100644 index 0000000000..2b7b879e37 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-strchr.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include "../strchr.S" diff --git a/sysdeps/x86_64/multiarch/rtld-strchrnul.S b/sysdeps/x86_64/multiarch/rtld-strchrnul.S new file mode 100644 index 0000000000..0cc5becc88 --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-strchrnul.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include "../strchrnul.S" diff --git a/sysdeps/x86_64/multiarch/strchr-sse2.S b/sysdeps/x86_64/multiarch/strchr-sse2.S index 992f700077..f7767ca543 100644 --- a/sysdeps/x86_64/multiarch/strchr-sse2.S +++ b/sysdeps/x86_64/multiarch/strchr-sse2.S @@ -16,13 +16,172 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/

-#if IS_IN (libc)
-# define strchr __strchr_sse2
+#if IS_IN (libc) || defined STRCHR
+# ifndef STRCHR
+#  define STRCHR __strchr_sse2
+# endif

-# undef weak_alias
-# define weak_alias(strchr, index)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strchr)
-#endif
+# include <sysdep.h>
+
+	.text
+ENTRY (STRCHR)			/* In: rdi = s, esi = c.  Out: rax.  */
+	movd	%esi, %xmm1
+	movl	%edi, %eax
+	andl	$4095, %eax		/* eax = s % 4096.  */
+	punpcklbw %xmm1, %xmm1
+	cmpl	$4032, %eax		/* 4032 = 4096 - 64.  */
+	punpcklwd %xmm1, %xmm1
+	pshufd	$0, %xmm1, %xmm1	/* xmm1 = low byte of c in all 16 lanes.  */
+	jg	L(cross_page)		/* 64 bytes from s would pass a 4 KiB boundary.  */
+	movdqu	(%rdi), %xmm0
+	pxor	%xmm3, %xmm3		/* xmm3 = 0, used for the NUL compares.  */
+	movdqa	%xmm0, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax		/* Bit i set: s[i] == c or s[i] == 0.  */
+	test	%eax, %eax
+	je	L(next_48_bytes)
+	bsf	%eax, %eax		/* Index of the first hit.  */
+# ifdef AS_STRCHRNUL
+	leaq	(%rdi,%rax), %rax
+# else
+	movl	$0, %edx
+	leaq	(%rdi,%rax), %rax
+	cmpb	%sil, (%rax)
+	cmovne	%rdx, %rax		/* First hit is not c: return 0.  */
+# endif
+	ret
+
+	.p2align 3
+L(next_48_bytes):		/* Bytes 16..63; bits 0..15 of rax stay 0.  */
+	movdqu	16(%rdi), %xmm0
+	movdqa	%xmm0, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %ecx		/* Hit mask for bytes 16..31.  */
+	movdqu	32(%rdi), %xmm0
+	movdqa	%xmm0, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	salq	$16, %rcx
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax		/* Hit mask for bytes 32..47.  */
+	movdqu	48(%rdi), %xmm0
+	pcmpeqb	%xmm0, %xmm3		/* Overwrites the zero in xmm3 with the NUL mask.  */
+	salq	$32, %rax
+	pcmpeqb	%xmm1, %xmm0
+	orq	%rcx, %rax
+	por	%xmm3, %xmm0
+	pmovmskb %xmm0, %ecx		/* Hit mask for bytes 48..63.  */
+	salq	$48, %rcx
+	orq	%rcx, %rax
+	testq	%rax, %rax
+	jne	L(return)
+L(loop_start):
+	/* We use this alignment to force the loop to be aligned to 8 but
+	   not 16 bytes.  This gives better scheduling on AMD processors.  */
+	.p2align 4
+	pxor	%xmm6, %xmm6		/* xmm6 = 0.  */
+	andq	$-64, %rdi		/* Round rdi down to a multiple of 64.  */
+	.p2align 3
+L(loop64):
+	addq	$64, %rdi
+	movdqa	(%rdi), %xmm5
+	movdqa	16(%rdi), %xmm2
+	movdqa	32(%rdi), %xmm3
+	pxor	%xmm1, %xmm5
+	movdqa	48(%rdi), %xmm4
+	pxor	%xmm1, %xmm2
+	pxor	%xmm1, %xmm3
+	pminub	(%rdi), %xmm5		/* min (x ^ c, x) is 0 iff x == c or x == 0.  */
+	pxor	%xmm1, %xmm4
+	pminub	16(%rdi), %xmm2
+	pminub	32(%rdi), %xmm3
+	pminub	%xmm2, %xmm5
+	pminub	48(%rdi), %xmm4
+	pminub	%xmm3, %xmm5
+	pminub	%xmm4, %xmm5		/* xmm5 = bytewise min over all four chunks.  */
+	pcmpeqb	%xmm6, %xmm5
+	pmovmskb %xmm5, %eax
+
+	testl	%eax, %eax
+	je	L(loop64)		/* Neither c nor NUL in these 64 bytes.  */

-#include "../strchr.S"
+	movdqa	(%rdi), %xmm5		/* Redo chunk 0; xmm5 held the combined min.  */
+	movdqa	%xmm5, %xmm0
+	pcmpeqb	%xmm1, %xmm5
+	pcmpeqb	%xmm6, %xmm0
+	por	%xmm0, %xmm5
+	pcmpeqb	%xmm6, %xmm2		/* xmm2..xmm4 still hold the per-chunk mins.  */
+	pcmpeqb	%xmm6, %xmm3
+	pcmpeqb	%xmm6, %xmm4
+
+	pmovmskb %xmm5, %ecx
+	pmovmskb %xmm2, %eax
+	salq	$16, %rax
+	pmovmskb %xmm3, %r8d
+	pmovmskb %xmm4, %edx
+	salq	$32, %r8
+	orq	%r8, %rax
+	orq	%rcx, %rax
+	salq	$48, %rdx
+	orq	%rdx, %rax		/* rax = 64-bit hit mask for the block at rdi.  */
+	.p2align 3
+L(return):
+	bsfq	%rax, %rax		/* Offset of the first hit from rdi.  */
+# ifdef AS_STRCHRNUL
+	leaq	(%rdi,%rax), %rax
+# else
+	movl	$0, %edx
+	leaq	(%rdi,%rax), %rax
+	cmpb	%sil, (%rax)
+	cmovne	%rdx, %rax		/* First hit is not c: return 0.  */
+# endif
+	ret
+	.p2align 4
+
+L(cross_page):
+	movq	%rdi, %rdx
+	pxor	%xmm2, %xmm2		/* xmm2 = 0.  */
+	andq	$-64, %rdx		/* rdx = s rounded down to a multiple of 64.  */
+	movdqa	%xmm1, %xmm0		/* Copy of the c broadcast for the last chunk.  */
+	movdqa	(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %r8d		/* Hit mask for bytes 0..15 of the block.  */
+	movdqa	16(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %eax		/* Hit mask for bytes 16..31.  */
+	movdqa	32(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	salq	$16, %rax
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %r9d		/* Hit mask for bytes 32..47.  */
+	movdqa	48(%rdx), %xmm3
+	pcmpeqb	%xmm3, %xmm2		/* Overwrites the zero in xmm2 with the NUL mask.  */
+	salq	$32, %r9
+	pcmpeqb	%xmm3, %xmm0
+	orq	%r9, %rax
+	orq	%r8, %rax
+	por	%xmm2, %xmm0
+	pmovmskb %xmm0, %ecx		/* Hit mask for bytes 48..63.  */
+	salq	$48, %rcx
+	orq	%rcx, %rax
+	movl	%edi, %ecx
+	subb	%dl, %cl		/* cl = s - rdx, i.e. s % 64.  */
+	shrq	%cl, %rax		/* Drop hits located before s.  */
+	testq	%rax, %rax
+	jne	L(return)
+	jmp	L(loop_start)		/* loop_start rounds rdi down itself.  */
+
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchrnul-sse2.S b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
index f91c670369..7238977a21 100644
--- a/sysdeps/x86_64/multiarch/strchrnul-sse2.S
+++ b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
@@ -17,10 +17,11 @@
    <https://www.gnu.org/licenses/>.  */

 #if IS_IN (libc)
-# define __strchrnul __strchrnul_sse2
-
-# undef weak_alias
-# define weak_alias(__strchrnul, strchrnul)
+# ifndef STRCHR
+#  define STRCHR __strchrnul_sse2
+# endif
 #endif

-#include "../strchrnul.S"
+#define AS_STRCHRNUL	/* Selects the return paths that skip the c check.  */
+
+#include "strchr-sse2.S"
-- 
cgit 1.4.1