author | Zack Weinberg <zackw@panix.com> | 2017-06-08 15:39:03 -0400
committer | Zack Weinberg <zackw@panix.com> | 2017-06-08 15:39:03 -0400
commit | 5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree | 4470480d904b65cf14ca524f96f79eca818c3eaf /REORG.TODO/sysdeps/i386/i686/multiarch
parent | 199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
download | glibc-5046dbb4a7eba5eccfd258f92f4735c9ffc8d069.tar.gz glibc-5046dbb4a7eba5eccfd258f92f4735c9ffc8d069.tar.xz glibc-5046dbb4a7eba5eccfd258f92f4735c9ffc8d069.zip
Prepare for radical source tree reorganization. (branch: zack/build-layout-experiment)
All top-level files and directories are moved into a temporary storage directory, REORG.TODO, except for files that will certainly still exist in their current form at top level when we're done (COPYING, COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which are moved to the new directory OldChangeLogs, instead), and the generated file INSTALL (which is just deleted; in the new order, there will be no generated files checked into version control).
Diffstat (limited to 'REORG.TODO/sysdeps/i386/i686/multiarch')
127 files changed, 32113 insertions, 0 deletions
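Every file in this subtree follows the same multiarch pattern: the Makefile builds several variants of each string/memory routine (SSE2, SSSE3, SSE4.2, ...), ifunc-impl-list.c enumerates them so the test suite can exercise each one, and a small per-function dispatch stub (bcopy.S, bzero.S, memchr.S, ...) picks one variant at load time through an indirect function (IFUNC) resolver. As orientation before the raw diff below, here is a minimal C-level sketch of that mechanism using GCC's `ifunc` attribute and `__builtin_cpu_supports`; all names are hypothetical, and glibc's i386 stubs implement the resolver in hand-written assembly with `HAS_CPU_FEATURE` and `LOAD_FUNC_GOT_EAX` rather than compiler builtins.

```c
/* Minimal IFUNC sketch (hypothetical names, not glibc's internal API).
   The resolver runs once, when the dynamic linker binds the symbol,
   so ordinary calls pay no per-call dispatch cost.  */
#include <stddef.h>

static void *my_memcpy_generic (void *dst, const void *src, size_t n)
{
  char *d = dst;
  const char *s = src;
  while (n--)
    *d++ = *s++;                      /* portable byte-copy fallback */
  return dst;
}

static void *my_memcpy_sse2 (void *dst, const void *src, size_t n)
{
  /* Stand-in for an SSE2-optimized body.  */
  return my_memcpy_generic (dst, src, n);
}

/* Resolver: invoked via the IFUNC relocation; returns the address the
   public symbol should be bound to on this CPU.  */
static void *my_memcpy_resolver (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("sse2"))
    return (void *) my_memcpy_sse2;
  return (void *) my_memcpy_generic;
}

/* The public symbol; calls land on whichever variant was chosen.  */
void *my_memcpy (void *dst, const void *src, size_t n)
  __attribute__ ((ifunc ("my_memcpy_resolver")));
```

The assembly stubs in this diff perform the same selection by hand (note the `HAS_CPU_FEATURE (SSE2)` / `jz 2f` fallthrough chains in bcopy.S and bzero.S), and the comment in bzero.S about EBX and the PLT explains why the shared-library builds must redirect the hidden `__GI_*` aliases to the plain ia32 variant instead of going through the IFUNC.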
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile new file mode 100644 index 0000000000..4a0c20c051 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile @@ -0,0 +1,44 @@ +ifeq ($(subdir),csu) +tests += test-multiarch +endif + +ifeq ($(subdir),string) +gen-as-const-headers += locale-defines.sym +sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ + memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \ + memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \ + memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \ + strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \ + memcmp-ssse3 memcmp-sse4 varshift \ + strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \ + strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \ + strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \ + strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \ + strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \ + memchr-sse2 memchr-sse2-bsf \ + memrchr-sse2 memrchr-sse2-bsf memrchr-c \ + rawmemchr-sse2 rawmemchr-sse2-bsf \ + strnlen-sse2 strnlen-c \ + strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \ + strncase_l-c strncase-c strncase_l-ssse3 \ + strcasecmp_l-sse4 strncase_l-sse4 \ + bcopy-sse2-unaligned memcpy-sse2-unaligned \ + mempcpy-sse2-unaligned memmove-sse2-unaligned \ + strcspn-c strpbrk-c strspn-c +CFLAGS-varshift.c += -msse4 +CFLAGS-strcspn-c.c += -msse4 +CFLAGS-strpbrk-c.c += -msse4 +CFLAGS-strspn-c.c += -msse4 +endif + +ifeq ($(subdir),wcsmbs) +sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \ + wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \ + wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c +endif + +ifeq ($(subdir),math) +libm-sysdep_routines += s_fma-fma s_fmaf-fma +CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse +CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse +endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S new file mode 100644 index 0000000000..efef2a10dd --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_sse2_unaligned +#include "memcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S new file mode 100644 index 0000000000..cbc8b420e8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S new file mode 100644 index 0000000000..36aac44b9c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_ssse3 +#include "memcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S new file mode 100644 index 0000000000..877f82c28f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S @@ -0,0 +1,59 @@ +/* Multiple versions of bcopy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) + .text +ENTRY(bcopy) + .type bcopy, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__bcopy_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__bcopy_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__bcopy_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__bcopy_ssse3_rep) +2: ret +END(bcopy) + +# undef ENTRY +# define ENTRY(name) \ + .type __bcopy_ia32, @function; \ + .p2align 4; \ + .globl __bcopy_ia32; \ + .hidden __bcopy_ia32; \ + __bcopy_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32 + +#endif + +#include "../bcopy.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S new file mode 100644 index 0000000000..507b288bb3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S @@ -0,0 +1,3 @@ +#define USE_AS_BZERO +#define __memset_sse2_rep __bzero_sse2_rep +#include "memset-sse2-rep.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S new file mode 100644 index 0000000000..8d04512e4e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_BZERO +#define __memset_sse2 __bzero_sse2 +#include "memset-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S new file mode 100644 index 0000000000..9dac490aa2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S @@ -0,0 +1,62 @@ +/* Multiple versions of bzero + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. 
*/ +#if IS_IN (libc) + .text +ENTRY(__bzero) + .type __bzero, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__bzero_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX ( __bzero_sse2) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__bzero_sse2_rep) +2: ret +END(__bzero) + +# undef ENTRY +# define ENTRY(name) \ + .type __bzero_ia32, @function; \ + .p2align 4; \ + .globl __bzero_ia32; \ + .hidden __bzero_ia32; \ + __bzero_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __bzero_ia32, .-__bzero_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI___bzero; __GI___bzero = __bzero_ia32 +# endif +#endif + +#include "../bzero.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c new file mode 100644 index 0000000000..e8026a2a78 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c @@ -0,0 +1,376 @@ +/* Enumerate available IFUNC implementations of a function. i686 version. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <assert.h> +#include <string.h> +#include <wchar.h> +#include <ifunc-impl-list.h> +#include "init-arch.h" + +/* Maximum number of IFUNC implementations. */ +#define MAX_IFUNC 4 + +/* Fill ARRAY of MAX elements with IFUNC implementations for function + NAME and return the number of valid entries. */ + +size_t +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + size_t max) +{ + assert (max >= MAX_IFUNC); + + size_t i = 0; + + /* Support sysdeps/i386/i686/multiarch/bcopy.S. */ + IFUNC_IMPL (i, name, bcopy, + IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3), + __bcopy_ssse3_rep) + IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3), + __bcopy_ssse3) + IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2), + __bcopy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/bzero.S. */ + IFUNC_IMPL (i, name, bzero, + IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2), + __bzero_sse2_rep) + IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2), + __bzero_sse2) + IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memchr.S. 
*/ + IFUNC_IMPL (i, name, memchr, + IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2), + __memchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2), + __memchr_sse2) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memcmp.S. */ + IFUNC_IMPL (i, name, memcmp, + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_2), + __memcmp_sse4_2) + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3), + __memcmp_ssse3) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memmove_chk.S. */ + IFUNC_IMPL (i, name, __memmove_chk, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), + __memmove_chk_ssse3_rep) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), + __memmove_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSE2), + __memmove_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, + __memmove_chk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memmove.S. */ + IFUNC_IMPL (i, name, memmove, + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), + __memmove_ssse3_rep) + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), + __memmove_ssse3) + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2), + __memmove_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memrchr.S. */ + IFUNC_IMPL (i, name, memrchr, + IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2), + __memrchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2), + __memrchr_sse2) + IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memset_chk.S. */ + IFUNC_IMPL (i, name, __memset_chk, + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_CPU_FEATURE (SSE2), + __memset_chk_sse2_rep) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_CPU_FEATURE (SSE2), + __memset_chk_sse2) + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memset.S. */ + IFUNC_IMPL (i, name, memset, + IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2), + __memset_sse2_rep) + IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2), + __memset_sse2) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ia32)) + + /* Support sysdeps/i386/i686/multiarch/rawmemchr.S. */ + IFUNC_IMPL (i, name, rawmemchr, + IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2), + __rawmemchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2), + __rawmemchr_sse2) + IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/stpncpy.S. */ + IFUNC_IMPL (i, name, stpncpy, + IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3), + __stpncpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSE2), + __stpncpy_sse2) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/stpcpy.S. */ + IFUNC_IMPL (i, name, stpcpy, + IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3), + __stpcpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSE2), + __stpcpy_sse2) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcasecmp.S. 
*/ + IFUNC_IMPL (i, name, strcasecmp, + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSE4_2), + __strcasecmp_sse4_2) + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSSE3), + __strcasecmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcasecmp_l.S. */ + IFUNC_IMPL (i, name, strcasecmp_l, + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSE4_2), + __strcasecmp_l_sse4_2) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSSE3), + __strcasecmp_l_ssse3) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, + __strcasecmp_l_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcat.S. */ + IFUNC_IMPL (i, name, strcat, + IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3), + __strcat_ssse3) + IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSE2), + __strcat_sse2) + IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strchr.S. */ + IFUNC_IMPL (i, name, strchr, + IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2), + __strchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2), + __strchr_sse2) + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcmp.S. */ + IFUNC_IMPL (i, name, strcmp, + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2), + __strcmp_sse4_2) + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3), + __strcmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcpy.S. */ + IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3), + __strcpy_ssse3) + IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSE2), + __strcpy_sse2) + IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcspn.S. */ + IFUNC_IMPL (i, name, strcspn, + IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2), + __strcspn_sse42) + IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncase.S. */ + IFUNC_IMPL (i, name, strncasecmp, + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSE4_2), + __strncasecmp_sse4_2) + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSSE3), + __strncasecmp_ssse3) + IFUNC_IMPL_ADD (array, i, strncasecmp, 1, + __strncasecmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncase_l.S. */ + IFUNC_IMPL (i, name, strncasecmp_l, + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSE4_2), + __strncasecmp_l_sse4_2) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSSE3), + __strncasecmp_l_ssse3) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, + __strncasecmp_l_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncat.S. */ + IFUNC_IMPL (i, name, strncat, + IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3), + __strncat_ssse3) + IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSE2), + __strncat_sse2) + IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncpy.S. */ + IFUNC_IMPL (i, name, strncpy, + IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3), + __strncpy_ssse3) + IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSE2), + __strncpy_sse2) + IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strnlen.S. 
*/ + IFUNC_IMPL (i, name, strnlen, + IFUNC_IMPL_ADD (array, i, strnlen, HAS_CPU_FEATURE (SSE2), + __strnlen_sse2) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strpbrk.S. */ + IFUNC_IMPL (i, name, strpbrk, + IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2), + __strpbrk_sse42) + IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strrchr.S. */ + IFUNC_IMPL (i, name, strrchr, + IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2), + __strrchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2), + __strrchr_sse2) + IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strspn.S. */ + IFUNC_IMPL (i, name, strspn, + IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2), + __strspn_sse42) + IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcschr.S. */ + IFUNC_IMPL (i, name, wcschr, + IFUNC_IMPL_ADD (array, i, wcschr, HAS_CPU_FEATURE (SSE2), + __wcschr_sse2) + IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcscmp.S. */ + IFUNC_IMPL (i, name, wcscmp, + IFUNC_IMPL_ADD (array, i, wcscmp, HAS_CPU_FEATURE (SSE2), + __wcscmp_sse2) + IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcscpy.S. */ + IFUNC_IMPL (i, name, wcscpy, + IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3), + __wcscpy_ssse3) + IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcslen.S. */ + IFUNC_IMPL (i, name, wcslen, + IFUNC_IMPL_ADD (array, i, wcslen, HAS_CPU_FEATURE (SSE2), + __wcslen_sse2) + IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcsrchr.S. */ + IFUNC_IMPL (i, name, wcsrchr, + IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_CPU_FEATURE (SSE2), + __wcsrchr_sse2) + IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wmemcmp.S. */ + IFUNC_IMPL (i, name, wmemcmp, + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_2), + __wmemcmp_sse4_2) + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3), + __wmemcmp_ssse3) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_ia32)) + +#ifdef SHARED + /* Support sysdeps/i386/i686/multiarch/memcpy_chk.S. */ + IFUNC_IMPL (i, name, __memcpy_chk, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __memcpy_chk_ssse3_rep) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __memcpy_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSE2), + __memcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, + __memcpy_chk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memcpy.S. */ + IFUNC_IMPL (i, name, memcpy, + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), + __memcpy_ssse3_rep) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), + __memcpy_ssse3) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2), + __memcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S. 
*/ + IFUNC_IMPL (i, name, __mempcpy_chk, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __mempcpy_chk_ssse3_rep) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __mempcpy_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSE2), + __mempcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, + __mempcpy_chk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/mempcpy.S. */ + IFUNC_IMPL (i, name, mempcpy, + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), + __mempcpy_ssse3_rep) + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), + __mempcpy_ssse3) + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2), + __mempcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strlen.S. */ + IFUNC_IMPL (i, name, strlen, + IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2), + __strlen_sse2_bsf) + IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2), + __strlen_sse2) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncmp.S. */ + IFUNC_IMPL (i, name, strncmp, + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2), + __strncmp_sse4_2) + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3), + __strncmp_ssse3) + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_ia32)) +#endif + + return i; +} diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym new file mode 100644 index 0000000000..aebff9a4f9 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym @@ -0,0 +1,11 @@ +#include <locale/localeinfo.h> +#include <langinfo.h> +#include <stddef.h> + +-- + +LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales) +LC_CTYPE +_NL_CTYPE_NONASCII_CASE +LOCALE_DATA_VALUES offsetof (struct __locale_data, values) +SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0]) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S new file mode 100644 index 0000000000..dd316486e6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S @@ -0,0 +1,502 @@ +/* Optimized memchr with sse2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + +# ifndef USE_AS_RAWMEMCHR +# define LEN STR2+4 +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); +# endif + +# ifndef MEMCHR +# define MEMCHR __memchr_sse2_bsf +# endif + + .text +ENTRY (MEMCHR) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + +# ifndef USE_AS_RAWMEMCHR + mov LEN(%esp), %edx + test %edx, %edx + jz L(return_null_1) +# endif + mov %ecx, %eax + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %ecx + ja L(crosscache) + + movdqu (%eax), %xmm0 + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %ecx + test %ecx, %ecx + je L(unaligned_no_match_1) +/* Check which byte is a match. */ + bsf %ecx, %ecx + +# ifndef USE_AS_RAWMEMCHR + sub %ecx, %edx + jbe L(return_null_1) +# endif + add %ecx, %eax + ret + + .p2align 4 +L(unaligned_no_match_1): +# ifndef USE_AS_RAWMEMCHR + sub $16, %edx + jbe L(return_null_1) + PUSH (%edi) + lea 16(%eax), %edi + and $15, %eax + and $-16, %edi + add %eax, %edx +# else + lea 16(%eax), %edx + and $-16, %edx +# endif + jmp L(loop_prolog) + + .p2align 4 +L(return_null_1): + xor %eax, %eax + ret + +# ifndef USE_AS_RAWMEMCHR + CFI_POP (%edi) +# endif + + .p2align 4 +L(crosscache): +/* Handle unaligned string. */ + +# ifndef USE_AS_RAWMEMCHR + PUSH (%edi) + mov %eax, %edi + and $15, %ecx + and $-16, %edi + movdqa (%edi), %xmm0 +# else + mov %eax, %edx + and $15, %ecx + and $-16, %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + sub %eax, %edx + jbe L(return_null) + add %edi, %eax + add %ecx, %eax + RETURN +# else + add %edx, %eax + add %ecx, %eax + ret +# endif + + .p2align 4 +L(unaligned_no_match): +# ifndef USE_AS_RAWMEMCHR + /* Calculate the last acceptable address and check for possible + addition overflow by using satured math: + edx = ecx + edx + edx |= -(edx < ecx) */ + add %ecx, %edx + sbb %eax, %eax + or %eax, %edx + sub $16, %edx + jbe L(return_null) + add $16, %edi +# else + add $16, %edx +# endif + + .p2align 4 +/* Loop start on aligned string. 
*/ +L(loop_prolog): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + +# ifndef USE_AS_RAWMEMCHR + test $0x3f, %edi +# else + test $0x3f, %edx +# endif + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm3 +# else + movdqa 48(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + test %eax, %eax + jnz L(matches0) + +# ifndef USE_AS_RAWMEMCHR + mov %edi, %ecx + and $-64, %edi + and $63, %ecx + add %ecx, %edx +# else + and $-64, %edx +# endif + + .p2align 4 +L(align64_loop): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 + movdqa 16(%edi), %xmm2 + movdqa 32(%edi), %xmm3 + movdqa 48(%edi), %xmm4 +# else + movdqa (%edx), %xmm0 + movdqa 16(%edx), %xmm2 + movdqa 32(%edx), %xmm3 + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + + test %eax, %eax + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edi +# else + sub $64, %edx +# endif + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + + pcmpeqb %xmm1, %xmm3 + +# ifndef USE_AS_RAWMEMCHR + pcmpeqb 48(%edi), %xmm1 +# else + pcmpeqb 48(%edx), %xmm1 +# endif + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + lea 48(%edi, %eax), %eax + RETURN +# else + lea 48(%edx, %eax), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%edi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%edi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test 
%eax, %eax + jnz L(matches32_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb 48(%edi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches48_1) + xor %eax, %eax + RETURN + + .p2align 4 +L(exit_loop_32): + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 16(%edi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches16_1) + xor %eax, %eax + RETURN +# endif + .p2align 4 +L(matches0): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea -16(%eax, %edi), %eax + RETURN +# else + lea -16(%eax, %edx), %eax + ret +# endif + + .p2align 4 +L(matches): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + add %edi, %eax + RETURN +# else + add %edx, %eax + ret +# endif + + .p2align 4 +L(matches16): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea 16(%eax, %edi), %eax + RETURN +# else + lea 16(%eax, %edx), %eax + ret +# endif + + .p2align 4 +L(matches32): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea 32(%eax, %edi), %eax + RETURN +# else + lea 32(%eax, %edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(matches_1): + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + add %edi, %eax + RETURN + + .p2align 4 +L(matches16_1): + sub $16, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 16(%edi, %eax), %eax + RETURN + + .p2align 4 +L(matches32_1): + sub $32, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 32(%edi, %eax), %eax + RETURN + + .p2align 4 +L(matches48_1): + sub $48, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 48(%edi, %eax), %eax + RETURN +# endif + .p2align 4 +L(return_null): + xor %eax, %eax +# ifndef USE_AS_RAWMEMCHR + RETURN +# else + ret +# endif + +END (MEMCHR) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S new file mode 100644 index 0000000000..172d70de13 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S @@ -0,0 +1,709 @@ +/* Optimized memchr with sse2 without bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef USE_AS_RAWMEMCHR +# define ENTRANCE PUSH(%edi); +# define PARMS 8 +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); +# else +# define ENTRANCE +# define PARMS 4 +# endif + +# define STR1 PARMS +# define STR2 STR1+4 + +# ifndef USE_AS_RAWMEMCHR +# define LEN STR2+4 +# endif + +# ifndef MEMCHR +# define MEMCHR __memchr_sse2 +# endif + + atom_text_section +ENTRY (MEMCHR) + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 +# ifndef USE_AS_RAWMEMCHR + mov LEN(%esp), %edx + test %edx, %edx + jz L(return_null) +# endif + + punpcklbw %xmm1, %xmm1 +# ifndef USE_AS_RAWMEMCHR + mov %ecx, %edi +# else + mov %ecx, %edx +# endif + punpcklbw %xmm1, %xmm1 + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + cmp $48, %ecx + ja L(crosscache) + +# ifndef USE_AS_RAWMEMCHR + movdqu (%edi), %xmm0 +# else + movdqu (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax +# ifndef USE_AS_RAWMEMCHR + jnz L(match_case2_prolog) + + sub $16, %edx + jbe L(return_null) + lea 16(%edi), %edi + and $15, %ecx + and $-16, %edi + add %ecx, %edx +# else + jnz L(match_case1_prolog) + lea 16(%edx), %edx + and $-16, %edx +# endif + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %ecx +# ifndef USE_AS_RAWMEMCHR + and $-16, %edi + movdqa (%edi), %xmm0 +# else + and $-16, %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + sar %cl, %eax + test %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + jnz L(match_case2_prolog1) + /* "ecx" is less than 16. Calculate "edx + ecx - 16" by using + "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to void + possible addition overflow. 
*/ + neg %ecx + add $16, %ecx + sub %ecx, %edx + jbe L(return_null) + lea 16(%edi), %edi +# else + jnz L(match_case1_prolog1) + lea 16(%edx), %edx +# endif + + .p2align 4 +L(loop_prolog): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + lea 16(%ecx), %ecx + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + lea 64(%edi), %edi + sub $64, %edx + jbe L(exit_loop) + + movdqa (%edi), %xmm0 +# else + lea 64(%edx), %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + lea 16(%ecx), %ecx + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + lea 64(%edi), %edi + mov %edi, %ecx + and $-64, %edi + and $63, %ecx + add %ecx, %edx +# else + lea 64(%edx), %edx + and $-64, %edx +# endif + + .p2align 4 +L(align64_loop): + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 + movdqa 16(%edi), %xmm2 + movdqa 32(%edi), %xmm3 + movdqa 48(%edi), %xmm4 +# else + movdqa (%edx), %xmm0 + movdqa 16(%edx), %xmm2 + movdqa 32(%edx), %xmm3 + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + pmovmskb %xmm4, %eax + + test %eax, %eax + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edi +# else + sub $64, %edx +# endif + + pmovmskb %xmm0, %eax + xor %ecx, %ecx + test %eax, %eax + jnz L(match_case1) + + pmovmskb %xmm2, %eax + lea 16(%ecx), %ecx + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + lea 16(%ecx), %ecx + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + pcmpeqb 48(%edi), %xmm1 +# else + pcmpeqb 48(%edx), %xmm1 +# endif + pmovmskb %xmm1, %eax + lea 16(%ecx), %ecx + + .p2align 4 +L(match_case1): +# ifndef USE_AS_RAWMEMCHR + add %ecx, %edi +# else +L(match_case1_prolog1): + add %ecx, %edx +L(match_case1_prolog): +# endif + test %al, %al + jz L(match_case1_high) + mov %al, %cl + and $15, %cl + jz L(match_case1_8) + test $0x01, %al + jnz 
L(ExitCase1_1) + test $0x02, %al + jnz L(ExitCase1_2) + test $0x04, %al + jnz L(ExitCase1_3) +# ifndef USE_AS_RAWMEMCHR + lea 3(%edi), %eax + RETURN +# else + lea 3(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_8): + test $0x10, %al + jnz L(ExitCase1_5) + test $0x20, %al + jnz L(ExitCase1_6) + test $0x40, %al + jnz L(ExitCase1_7) +# ifndef USE_AS_RAWMEMCHR + lea 7(%edi), %eax + RETURN +# else + lea 7(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_high): + mov %ah, %ch + and $15, %ch + jz L(match_case1_high_8) + test $0x01, %ah + jnz L(ExitCase1_9) + test $0x02, %ah + jnz L(ExitCase1_10) + test $0x04, %ah + jnz L(ExitCase1_11) +# ifndef USE_AS_RAWMEMCHR + lea 11(%edi), %eax + RETURN +# else + lea 11(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_high_8): + test $0x10, %ah + jnz L(ExitCase1_13) + test $0x20, %ah + jnz L(ExitCase1_14) + test $0x40, %ah + jnz L(ExitCase1_15) +# ifndef USE_AS_RAWMEMCHR + lea 15(%edi), %eax + RETURN +# else + lea 15(%edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(exit_loop): + add $64, %edx + + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case2) + cmp $16, %edx + jbe L(return_null) + + movdqa 16(%edi), %xmm2 + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case2) + cmp $32, %edx + jbe L(return_null) + + movdqa 32(%edi), %xmm3 + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case2) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb 48(%edi), %xmm1 + lea 16(%ecx), %ecx + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(match_case2) + + xor %eax, %eax + RETURN +# endif + + .p2align 4 +L(ExitCase1_1): +# ifndef USE_AS_RAWMEMCHR + mov %edi, %eax + RETURN +# else + mov %edx, %eax + ret +# endif + + .p2align 4 +L(ExitCase1_2): +# ifndef USE_AS_RAWMEMCHR + lea 1(%edi), %eax + RETURN +# else + lea 1(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_3): +# ifndef USE_AS_RAWMEMCHR + lea 2(%edi), %eax + RETURN +# else + lea 2(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_5): +# ifndef USE_AS_RAWMEMCHR + lea 4(%edi), %eax + RETURN +# else + lea 4(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_6): +# ifndef USE_AS_RAWMEMCHR + lea 5(%edi), %eax + RETURN +# else + lea 5(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_7): +# ifndef USE_AS_RAWMEMCHR + lea 6(%edi), %eax + RETURN +# else + lea 6(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_9): +# ifndef USE_AS_RAWMEMCHR + lea 8(%edi), %eax + RETURN +# else + lea 8(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_10): +# ifndef USE_AS_RAWMEMCHR + lea 9(%edi), %eax + RETURN +# else + lea 9(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_11): +# ifndef USE_AS_RAWMEMCHR + lea 10(%edi), %eax + RETURN +# else + lea 10(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_13): +# ifndef USE_AS_RAWMEMCHR + lea 12(%edi), %eax + RETURN +# else + lea 12(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_14): +# ifndef USE_AS_RAWMEMCHR + lea 13(%edi), %eax + RETURN +# else + lea 13(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_15): +# ifndef USE_AS_RAWMEMCHR + lea 14(%edi), %eax + RETURN +# else + lea 14(%edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(match_case2): + sub %ecx, %edx +L(match_case2_prolog1): + add %ecx, %edi +L(match_case2_prolog): + test %al, %al + jz L(match_case2_high) + mov %al, %cl + and $15, %cl + jz L(match_case2_8) 
+ test $0x01, %al + jnz L(ExitCase2_1) + test $0x02, %al + jnz L(ExitCase2_2) + test $0x04, %al + jnz L(ExitCase2_3) + sub $4, %edx + jb L(return_null) + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_8): + test $0x10, %al + jnz L(ExitCase2_5) + test $0x20, %al + jnz L(ExitCase2_6) + test $0x40, %al + jnz L(ExitCase2_7) + sub $8, %edx + jb L(return_null) + lea 7(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_high): + mov %ah, %ch + and $15, %ch + jz L(match_case2_high_8) + test $0x01, %ah + jnz L(ExitCase2_9) + test $0x02, %ah + jnz L(ExitCase2_10) + test $0x04, %ah + jnz L(ExitCase2_11) + sub $12, %edx + jb L(return_null) + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_high_8): + test $0x10, %ah + jnz L(ExitCase2_13) + test $0x20, %ah + jnz L(ExitCase2_14) + test $0x40, %ah + jnz L(ExitCase2_15) + sub $16, %edx + jb L(return_null) + lea 15(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_1): + mov %edi, %eax + RETURN + + .p2align 4 +L(ExitCase2_2): + sub $2, %edx + jb L(return_null) + lea 1(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_3): + sub $3, %edx + jb L(return_null) + lea 2(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_5): + sub $5, %edx + jb L(return_null) + lea 4(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_6): + sub $6, %edx + jb L(return_null) + lea 5(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_7): + sub $7, %edx + jb L(return_null) + lea 6(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_9): + sub $9, %edx + jb L(return_null) + lea 8(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_10): + sub $10, %edx + jb L(return_null) + lea 9(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_11): + sub $11, %edx + jb L(return_null) + lea 10(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_13): + sub $13, %edx + jb L(return_null) + lea 12(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_14): + sub $14, %edx + jb L(return_null) + lea 13(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_15): + sub $15, %edx + jb L(return_null) + lea 14(%edi), %eax + RETURN +# endif + + .p2align 4 +L(return_null): + xor %eax, %eax +# ifndef USE_AS_RAWMEMCHR + RETURN +# else + ret +# endif + +END (MEMCHR) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S new file mode 100644 index 0000000000..bd0dace290 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S @@ -0,0 +1,65 @@ +/* Multiple versions of memchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__memchr) + .type __memchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + HAS_CPU_FEATURE (SSE2) + jz 2f + HAS_ARCH_FEATURE (Slow_BSF) + jz 3f + + LOAD_FUNC_GOT_EAX ( __memchr_sse2) + ret + +2: LOAD_FUNC_GOT_EAX (__memchr_ia32) + ret + +3: LOAD_FUNC_GOT_EAX (__memchr_sse2_bsf) + ret +END(__memchr) + +weak_alias(__memchr, memchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __memchr_ia32, @function; \ + .globl __memchr_ia32; \ + .p2align 4; \ + __memchr_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memchr_ia32, .-__memchr_ia32 + +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memchr; __GI_memchr = __memchr_ia32 + +#endif +#include "../../memchr.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S new file mode 100644 index 0000000000..2aa13048b2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S @@ -0,0 +1,1225 @@ +/* memcmp with SSE4.2, wmemcmp with SSE4.2 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_sse4_2 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1 + 4 +# define LEN BLK2 + 4 +# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) + + +# ifdef SHARED +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +/* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ +/* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ +/* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ +/* We loaded the jump table and adjusted EDX/ESI. Go. */ \ + jmp *%ebx +# else +# define JMPTBL(I, B) I + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. 
*/ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + .section .text.sse4.2,"ax",@progbits +ENTRY (MEMCMP) + movl BLK1(%esp), %eax + movl BLK2(%esp), %edx + movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(return0) +# else + cmp $1, %ecx + jbe L(less1bytes) +# endif + + pxor %xmm0, %xmm0 + cmp $64, %ecx + ja L(64bytesormore) + cmp $8, %ecx + +# ifndef USE_AS_WMEMCMP + PUSH (%ebx) + jb L(less8bytes) +# else + jb L(less8bytes) + PUSH (%ebx) +# endif + + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less8bytes): + mov (%eax), %bl + cmpb (%edx), %bl + jne L(nonzero) + + mov 1(%eax), %bl + cmpb 1(%edx), %bl + jne L(nonzero) + + cmp $2, %ecx + jz L(0bytes) + + mov 2(%eax), %bl + cmpb 2(%edx), %bl + jne L(nonzero) + + cmp $3, %ecx + jz L(0bytes) + + mov 3(%eax), %bl + cmpb 3(%edx), %bl + jne L(nonzero) + + cmp $4, %ecx + jz L(0bytes) + + mov 4(%eax), %bl + cmpb 4(%edx), %bl + jne L(nonzero) + + cmp $5, %ecx + jz L(0bytes) + + mov 5(%eax), %bl + cmpb 5(%edx), %bl + jne L(nonzero) + + cmp $6, %ecx + jz L(0bytes) + + mov 6(%eax), %bl + cmpb 6(%edx), %bl + je L(0bytes) + +L(nonzero): + POP (%ebx) + mov $1, %eax + ja L(above) + neg %eax +L(above): + ret + CFI_PUSH (%ebx) +# endif + + .p2align 4 +L(0bytes): + POP (%ebx) + xor %eax, %eax + ret + +# ifdef USE_AS_WMEMCMP + +/* for wmemcmp, case N == 1 */ + + .p2align 4 +L(less8bytes): + mov (%eax), %ecx + cmp (%edx), %ecx + je L(return0) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret + + .p2align 4 +L(return0): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less1bytes): + jb L(0bytesend) + movzbl (%eax), %eax + movzbl (%edx), %edx + sub %edx, %eax + ret + + .p2align 4 +L(0bytesend): + xor %eax, %eax + ret +# endif + .p2align 4 +L(64bytesormore): + PUSH (%ebx) + mov %ecx, %ebx + mov $64, %ecx + sub $64, %ebx +L(64bytesormore_loop): + movdqu (%eax), %xmm1 + movdqu (%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_16diff) + + movdqu 16(%eax), %xmm1 + movdqu 16(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_32diff) + + movdqu 32(%eax), %xmm1 + movdqu 32(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_48diff) + + movdqu 48(%eax), %xmm1 + movdqu 48(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_64diff) + add %ecx, %eax + add %ecx, %edx + sub %ecx, %ebx + jae L(64bytesormore_loop) + add %ebx, %ecx + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) + +# ifdef USE_AS_WMEMCMP + +/* Label needs only for table_64bytes filling */ +L(unreal_case): +/* no code here */ + +# endif + .p2align 4 +L(find_16diff): + sub $16, %ecx +L(find_32diff): + sub $16, %ecx +L(find_48diff): + sub $16, %ecx +L(find_64diff): + add %ecx, %edx + add %ecx, %eax + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(16bytes): + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# else + .p2align 4 +L(16bytes): + 
mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + cmp -4(%edx), %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(49bytes): + movdqu -49(%eax), %xmm1 + movdqu -49(%edx), %xmm2 + mov $-49, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(33bytes): + movdqu -33(%eax), %xmm1 + movdqu -33(%edx), %xmm2 + mov $-33, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(17bytes): + mov -17(%eax), %ecx + mov -17(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(13bytes): + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(9bytes): + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(5bytes): + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(50bytes): + mov $-50, %ebx + movdqu -50(%eax), %xmm1 + movdqu -50(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + mov $-34, %ebx + movdqu -34(%eax), %xmm1 + movdqu -34(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%eax), %ecx + mov -18(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(14bytes): + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(10bytes): + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(6bytes): + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(2bytes): + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(51bytes): + mov $-51, %ebx + movdqu -51(%eax), %xmm1 + movdqu -51(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(35bytes): + mov $-35, %ebx + movdqu -35(%eax), %xmm1 + movdqu -35(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(19bytes): + movl -19(%eax), %ecx + movl -19(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(15bytes): + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(11bytes): + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(7bytes): + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(3bytes): + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) +L(1bytes): + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + .p2align 4 +L(52bytes): + movdqu -52(%eax), %xmm1 + movdqu -52(%edx), %xmm2 + mov $-52, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%eax), %xmm1 + movdqu -36(%edx), %xmm2 + mov $-36, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%eax), %xmm1 + movdqu -20(%edx), %xmm2 + mov $-20, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(53bytes): + movdqu -53(%eax), %xmm1 + movdqu -53(%edx), %xmm2 + mov $-53, %ebx + pxor %xmm1, %xmm2 
+ ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(37bytes): + mov $-37, %ebx + movdqu -37(%eax), %xmm1 + movdqu -37(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(21bytes): + mov $-21, %ebx + movdqu -21(%eax), %xmm1 + movdqu -21(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(54bytes): + movdqu -54(%eax), %xmm1 + movdqu -54(%edx), %xmm2 + mov $-54, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + mov $-38, %ebx + movdqu -38(%eax), %xmm1 + movdqu -38(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + mov $-22, %ebx + movdqu -22(%eax), %xmm1 + movdqu -22(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(55bytes): + movdqu -55(%eax), %xmm1 + movdqu -55(%edx), %xmm2 + mov $-55, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(39bytes): + mov $-39, %ebx + movdqu -39(%eax), %xmm1 + movdqu -39(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(23bytes): + mov $-23, %ebx + movdqu -23(%eax), %xmm1 + movdqu -23(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + .p2align 4 +L(56bytes): + movdqu -56(%eax), %xmm1 + movdqu -56(%edx), %xmm2 + mov $-56, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + mov $-40, %ebx + movdqu -40(%eax), %xmm1 + movdqu -40(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + mov $-24, %ebx + movdqu -24(%eax), %xmm1 + movdqu -24(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif + jne L(find_diff) + + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(57bytes): + movdqu -57(%eax), %xmm1 + movdqu -57(%edx), %xmm2 + mov $-57, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(41bytes): + mov $-41, %ebx + movdqu -41(%eax), %xmm1 + movdqu -41(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(25bytes): + mov $-25, %ebx + movdqu -25(%eax), %xmm1 + movdqu -25(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(58bytes): + movdqu -58(%eax), %xmm1 + movdqu -58(%edx), %xmm2 + mov $-58, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + mov $-42, %ebx + movdqu -42(%eax), %xmm1 + movdqu 
-42(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + mov $-26, %ebx + movdqu -26(%eax), %xmm1 + movdqu -26(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(59bytes): + movdqu -59(%eax), %xmm1 + movdqu -59(%edx), %xmm2 + mov $-59, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(43bytes): + mov $-43, %ebx + movdqu -43(%eax), %xmm1 + movdqu -43(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(27bytes): + mov $-27, %ebx + movdqu -27(%eax), %xmm1 + movdqu -27(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + .p2align 4 +L(60bytes): + movdqu -60(%eax), %xmm1 + movdqu -60(%edx), %xmm2 + mov $-60, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + mov $-44, %ebx + movdqu -44(%eax), %xmm1 + movdqu -44(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + mov $-28, %ebx + movdqu -28(%eax), %xmm1 + movdqu -28(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -12(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif + jne L(find_diff) + + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif + jne L(find_diff) + + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(61bytes): + movdqu -61(%eax), %xmm1 + movdqu -61(%edx), %xmm2 + mov $-61, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(45bytes): + mov $-45, %ebx + movdqu -45(%eax), %xmm1 + movdqu -45(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(29bytes): + mov $-29, %ebx + movdqu -29(%eax), %xmm1 + movdqu -29(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(62bytes): + movdqu -62(%eax), %xmm1 + movdqu -62(%edx), %xmm2 + mov $-62, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + mov $-46, %ebx + movdqu -46(%eax), %xmm1 + movdqu -46(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + mov $-30, %ebx + movdqu -30(%eax), %xmm1 + movdqu -30(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov 
-10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(63bytes): + movdqu -63(%eax), %xmm1 + movdqu -63(%edx), %xmm2 + mov $-63, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(47bytes): + mov $-47, %ebx + movdqu -47(%eax), %xmm1 + movdqu -47(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(31bytes): + mov $-31, %ebx + movdqu -31(%eax), %xmm1 + movdqu -31(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + + .p2align 4 +L(64bytes): + movdqu -64(%eax), %xmm1 + movdqu -64(%edx), %xmm2 + mov $-64, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%eax), %xmm1 + movdqu -48(%edx), %xmm2 + mov $-48, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%eax), %xmm1 + movdqu -32(%edx), %xmm2 + mov $-32, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -16(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -16(%edx), %ecx +# endif + jne L(find_diff) + + mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -12(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif + jne L(find_diff) + + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif + jne L(find_diff) + + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + mov (%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + mov 4(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + mov 8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + mov 12(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# else + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + cmp (%edx), %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + cmp 4(%edx), %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + cmp 8(%edx), %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + cmp 12(%edx), %ecx + + mov $0, %eax + jne L(find_diff) + RETURN +# endif + + .p2align 4 +L(find_diff): +# ifndef USE_AS_WMEMCMP + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + shr $16,%ecx + shr $16,%ebx + cmp %bl, %cl + jne L(end) + cmp %bx, %cx +L(end): + POP (%ebx) + mov $1, %eax + ja L(bigger) + neg %eax +L(bigger): + ret +# else + POP (%ebx) + mov $1, %eax + jg L(bigger) + neg %eax + ret + + .p2align 4 +L(bigger): + ret +# endif +END (MEMCMP) + + .section .rodata.sse4.2,"a",@progbits + .p2align 2 + .type L(table_64bytes), @object +# ifndef USE_AS_WMEMCMP +L(table_64bytes): + .int JMPTBL (L(0bytes), 
L(table_64bytes)) + .int JMPTBL (L(1bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(3bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(5bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(7bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(9bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(11bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(13bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(15bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(17bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(19bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(21bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(23bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(25bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(27bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(29bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(31bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(33bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(35bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(37bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(39bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(41bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(43bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(45bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(47bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(49bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(51bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(53bytes), L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(55bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(57bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(59bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(61bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(63bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL 
(L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S new file mode 100644 index 0000000000..5ebf5a4d73 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S @@ -0,0 +1,2157 @@ +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1+4 +# define LEN BLK2+4 +# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret +# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elements. +*/ + + atom_text_section +ENTRY (MEMCMP) + movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(zero) +# endif + + movl BLK1(%esp), %eax + cmp $48, %ecx + movl BLK2(%esp), %edx + jae L(48bytesormore) + +# ifndef USE_AS_WMEMCMP + cmp $1, %ecx + jbe L(less1bytes) +# endif + + PUSH (%ebx) + add %ecx, %edx + add %ecx, %eax + jmp L(less48bytes) + + CFI_POP (%ebx) + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less1bytes): + jb L(zero) + movb (%eax), %cl + cmp (%edx), %cl + je L(zero) + mov $1, %eax + ja L(1bytesend) + neg %eax +L(1bytesend): + ret +# endif + + .p2align 4 +L(zero): + xor %eax, %eax + ret + + .p2align 4 +L(48bytesormore): + PUSH (%ebx) + PUSH (%esi) + PUSH (%edi) + cfi_remember_state + movdqu (%eax), %xmm3 + movdqu (%edx), %xmm0 + movl %eax, %edi + movl %edx, %esi + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%edi), %edi + + sub $0xffff, %edx + lea 16(%esi), %esi + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %edx, %edi + sub %edx, %esi + add %edx, %ecx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %edx, %esi + +# ifndef USE_AS_WMEMCMP + cmp $8, %edx + jae L(next_unaligned_table) + cmp $0, %edx + je L(shr_0) + cmp $1, %edx + je L(shr_1) + cmp $2, %edx + je L(shr_2) + cmp $3, %edx + je L(shr_3) + cmp $4, %edx + je L(shr_4) + cmp $5, %edx + je L(shr_5) + cmp $6, %edx + je L(shr_6) + jmp L(shr_7) + + .p2align 2 +L(next_unaligned_table): + cmp $8, %edx + je L(shr_8) + cmp $9, %edx + je L(shr_9) + cmp $10, %edx + je L(shr_10) + cmp $11, %edx + je L(shr_11) + cmp $12, %edx + je L(shr_12) + cmp $13, %edx + je L(shr_13) + cmp $14, %edx + je L(shr_14) + jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif + + .p2align 4 +L(shr_0): + cmp $80, %ecx + jae L(shr_0_gobble) + lea -48(%ecx), %ecx + xor %eax, %eax + movaps (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + movaps 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 + pand %xmm1, %xmm2 + pmovmskb %xmm2, %edx + add $32, %edi + add $32, %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_0_gobble): + lea -48(%ecx), %ecx + movdqa (%esi), %xmm0 + xor %eax, %eax + pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 +L(shr_0_gobble_loop): + pand %xmm0, %xmm2 + sub $32, %ecx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%esi), %xmm0 + movdqa 48(%esi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%edi), %xmm0 + pcmpeqb 48(%edi), %xmm2 + lea 32(%edi), %edi + lea 32(%esi), %esi + jz
L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %ecx + jge L(shr_0_gobble_loop_next) + inc %edx + add $32, %ecx +L(shr_0_gobble_loop_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_1): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_1_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $1,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $1,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 1(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_1_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $1,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $1,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_1_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $1,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $1,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_1_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_1_gobble_next) + inc %edx + add $32, %ecx +L(shr_1_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 1(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_2): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $2,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_2_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $2,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $2,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $2,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $2,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_2_gobble_next) + inc %edx + add $32, %ecx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_3): + cmp 
$80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_3_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $3,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $3,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 3(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_3_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $3,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $3,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_3_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $3,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $3,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_3_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_3_gobble_next) + inc %edx + add $32, %ecx +L(shr_3_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 3(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_4): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $4,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_4_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $4,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $4,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $4,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $4,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_4_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_4_gobble_next) + inc %edx + add $32, %ecx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_5): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_5_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $5,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $5,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 5(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp 
L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_5_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $5,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $5,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_5_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $5,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $5,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_5_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_5_gobble_next) + inc %edx + add $32, %ecx +L(shr_5_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 5(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_6): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $6,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_6_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $6,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $6,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $6,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $6,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_6_gobble_next) + inc %edx + add $32, %ecx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_7): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_7_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $7,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $7,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 7(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_7_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $7,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $7,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_7_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $7,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $7,32(%esi), %xmm0 + 
pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_7_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_7_gobble_next) + inc %edx + add $32, %ecx +L(shr_7_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 7(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_8): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $8,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_8_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $8,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $8,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $8,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $8,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_8_gobble_next) + inc %edx + add $32, %ecx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_9): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_9_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $9,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $9,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 9(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_9_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $9,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $9,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_9_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $9,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $9,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_9_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_9_gobble_next) + inc %edx + add $32, %ecx +L(shr_9_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 9(%ecx, %esi,1), %edx + POP (%edi) + POP 
(%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_10): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $10,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_10_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $10, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $10, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $10,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $10,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_10_gobble_next) + inc %edx + add $32, %ecx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_11): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_11_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $11, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $11, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 11(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_11_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $11, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $11, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_11_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $11,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $11,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_11_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_11_gobble_next) + inc %edx + add $32, %ecx +L(shr_11_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 11(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_12): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, 
%edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_12_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $12, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $12, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $12,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $12,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_12_gobble_next) + inc %edx + add $32, %ecx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_13): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_13_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $13, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $13, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 13(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_13_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $13, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $13, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_13_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $13,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $13,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_13_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_13_gobble_next) + inc %edx + add $32, %ecx +L(shr_13_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 13(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_14): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_14_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $14, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $14, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + 
pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $14,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $14,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_14_gobble_next) + inc %edx + add $32, %ecx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_15): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_15_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $15, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $15, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 15(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_15_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $15, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $15, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_15_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $15,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $15,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_15_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_15_gobble_next) + inc %edx + add $32, %ecx +L(shr_15_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 15(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(exit): + pmovmskb %xmm1, %ebx + sub $0xffff, %ebx + jz L(first16bytes) + lea -16(%esi), %esi + lea -16(%edi), %edi + mov %ebx, %edx + +L(first16bytes): + add %eax, %esi +L(less16bytes): + +# ifndef USE_AS_WMEMCMP + test %dl, %dl + jz L(next_24_bytes) + + test $0x01, %dl + jnz L(Byte16) + + test $0x02, %dl + jnz L(Byte17) + + test $0x04, %dl + jnz L(Byte18) + + test $0x08, %dl + jnz L(Byte19) + + test $0x10, %dl + jnz L(Byte20) + + test $0x20, %dl + jnz L(Byte21) + + test $0x40, %dl + jnz L(Byte22) +L(Byte23): + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte16): + movzbl -16(%edi), %eax + movzbl -16(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte17): + movzbl -15(%edi), %eax + movzbl -15(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte18): + movzbl -14(%edi), %eax + movzbl -14(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte19): + movzbl -13(%edi), %eax + movzbl -13(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte20): + movzbl -12(%edi), %eax + movzbl -12(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte21): + movzbl -11(%edi), %eax + movzbl -11(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte22): + movzbl -10(%edi), %eax + movzbl 
-10(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(next_24_bytes): + lea 8(%edi), %edi + lea 8(%esi), %esi + test $0x01, %dh + jnz L(Byte16) + + test $0x02, %dh + jnz L(Byte17) + + test $0x04, %dh + jnz L(Byte18) + + test $0x08, %dh + jnz L(Byte19) + + test $0x10, %dh + jnz L(Byte20) + + test $0x20, %dh + jnz L(Byte21) + + test $0x40, %dh + jnz L(Byte22) + + .p2align 4 +L(Byte31): + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx + sub %edx, %eax + RETURN_END +# else + +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%edi), %eax + cmp -16(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word): + mov -12(%edi), %eax + cmp -12(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%edi), %eax + cmp -8(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word): + mov -4(%edi), %eax + cmp -4(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(nequal): + mov $1, %eax + jg L(nequal_bigger) + neg %eax + RETURN + + .p2align 4 +L(nequal_bigger): + RETURN_END +# endif + + CFI_PUSH (%ebx) + + .p2align 4 +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) +# ifndef USE_AS_WMEMCMP + cmp $9, %ecx + je L(9bytes) + cmp $10, %ecx + je L(10bytes) + cmp $11, %ecx + je L(11bytes) + cmp $12, %ecx + je L(12bytes) + cmp $13, %ecx + je L(13bytes) + cmp $14, %ecx + je L(14bytes) + jmp L(15bytes) +# else + jmp L(12bytes) +# endif + + .p2align 4 +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) +# ifndef USE_AS_WMEMCMP + cmp $17, %ecx + je L(17bytes) + cmp $18, %ecx + je L(18bytes) + cmp $19, %ecx + je L(19bytes) + cmp $20, %ecx + je L(20bytes) + cmp $21, %ecx + je L(21bytes) + cmp $22, %ecx + je L(22bytes) + jmp L(23bytes) +# else + jmp L(20bytes) +# endif + + .p2align 4 +L(more24bytes): + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) +# ifndef USE_AS_WMEMCMP + cmp $25, %ecx + je L(25bytes) + cmp $26, %ecx + je L(26bytes) + cmp $27, %ecx + je L(27bytes) + cmp $28, %ecx + je L(28bytes) + cmp $29, %ecx + je L(29bytes) + cmp $30, %ecx + je L(30bytes) + jmp L(31bytes) +# else + jmp L(28bytes) +# endif + + .p2align 4 +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) +# ifndef USE_AS_WMEMCMP + cmp $33, %ecx + je L(33bytes) + cmp $34, %ecx + je L(34bytes) + cmp $35, %ecx + je L(35bytes) + cmp $36, %ecx + je L(36bytes) + cmp $37, %ecx + je L(37bytes) + cmp $38, %ecx + je L(38bytes) + jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + .p2align 4 +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) +# ifndef USE_AS_WMEMCMP + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif + + .p2align 4 +L(more40bytes): + cmp $40, %ecx + je L(40bytes) +# ifndef USE_AS_WMEMCMP + cmp $41, %ecx + je L(41bytes) + cmp $42, %ecx + je L(42bytes) + cmp $43, %ecx + je L(43bytes) + cmp $44, %ecx + je L(44bytes) + cmp $45, %ecx + je L(45bytes) + cmp $46, %ecx + je L(46bytes) + jmp L(47bytes) + + .p2align 4 +L(44bytes): + mov -44(%eax), %ecx + mov -44(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + mov -40(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + mov -36(%edx), %ebx + cmp %ebx, %ecx + jne 
L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + mov -32(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + mov -28(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + mov -24(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + mov -20(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + POP (%ebx) + ret + CFI_PUSH (%ebx) +# else + .p2align 4 +L(44bytes): + mov -44(%eax), %ecx + cmp -44(%edx), %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + cmp -40(%edx), %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + cmp -36(%edx), %ecx + jne L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + cmp -32(%edx), %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + cmp -28(%edx), %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + cmp -24(%edx), %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + cmp -20(%edx), %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + xor %eax, %eax + cmp -4(%edx), %ecx + jne L(find_diff) + POP (%ebx) + ret + CFI_PUSH (%ebx) +# endif + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(45bytes): + mov -45(%eax), %ecx + mov -45(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(41bytes): + mov -41(%eax), %ecx + mov -41(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(37bytes): + mov -37(%eax), %ecx + mov -37(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(33bytes): + mov -33(%eax), %ecx + mov -33(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(29bytes): + mov -29(%eax), %ecx + mov -29(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(25bytes): + mov -25(%eax), %ecx + mov -25(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(21bytes): + mov -21(%eax), %ecx + mov -21(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(17bytes): + mov -17(%eax), %ecx + mov -17(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(13bytes): + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(9bytes): + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(5bytes): + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 +L(46bytes): + mov -46(%eax), %ecx + mov -46(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(42bytes): + mov -42(%eax), %ecx + mov -42(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(38bytes): + mov -38(%eax), %ecx + mov -38(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(34bytes): + mov -34(%eax), %ecx + mov -34(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(30bytes): + mov -30(%eax), %ecx + mov -30(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(26bytes): + mov -26(%eax), %ecx + mov -26(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(22bytes): + mov -22(%eax), %ecx + mov -22(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) 
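The L(Nbytes) labels above and below form fall-through ladders: earlier in the routine both pointers were advanced to one past the end of the buffers, so L(46bytes) compares the 4-byte word at offset -46 and then simply falls through L(42bytes), L(38bytes), and so on until only a word or byte tail remains. Only the entry point depends on the length, and each ladder serves one residue of the length mod 4. A minimal C sketch of the shape (hypothetical tail_cmp_mod4 helper, multiples of 4 only; the real ladders cover every residue and keep the operands in %ecx/%ebx):

#include <stdint.h>
#include <string.h>

/* Sketch, not the glibc code: e1/e2 point one past the last byte,
   n is 4, 8, 12 or 16, and each case compares one 4-byte word
   counted from the end, then falls through to the next case.  */
static int
tail_cmp_mod4 (const unsigned char *e1, const unsigned char *e2, unsigned n)
{
  uint32_t a, b;
  switch (n)
    {
    case 16:
      memcpy (&a, e1 - 16, 4); memcpy (&b, e2 - 16, 4);
      if (a != b) goto diff;
      /* fall through */
    case 12:
      memcpy (&a, e1 - 12, 4); memcpy (&b, e2 - 12, 4);
      if (a != b) goto diff;
      /* fall through */
    case 8:
      memcpy (&a, e1 - 8, 4); memcpy (&b, e2 - 8, 4);
      if (a != b) goto diff;
      /* fall through */
    case 4:
      memcpy (&a, e1 - 4, 4); memcpy (&b, e2 - 4, 4);
      if (a != b) goto diff;
    }
  return 0;
diff:
  /* The analogue of L(find_diff): the first differing byte in memory
     order decides the unsigned result, which is what the %cl/%cx/shr
     sequence computes on a little-endian load.  */
  for (int i = 0; i < 4; i++)
    {
      unsigned char ca = ((unsigned char *) &a)[i];
      unsigned char cb = ((unsigned char *) &b)[i];
      if (ca != cb)
        return ca < cb ? -1 : 1;
    }
  return 0;
}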
+L(18bytes): + mov -18(%eax), %ecx + mov -18(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(14bytes): + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(10bytes): + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(6bytes): + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(2bytes): + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 +L(47bytes): + movl -47(%eax), %ecx + movl -47(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(43bytes): + movl -43(%eax), %ecx + movl -43(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(39bytes): + movl -39(%eax), %ecx + movl -39(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(35bytes): + movl -35(%eax), %ecx + movl -35(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(31bytes): + movl -31(%eax), %ecx + movl -31(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(27bytes): + movl -27(%eax), %ecx + movl -27(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(23bytes): + movl -23(%eax), %ecx + movl -23(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(19bytes): + movl -19(%eax), %ecx + movl -19(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(15bytes): + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(11bytes): + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(7bytes): + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(3bytes): + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 +L(find_diff): + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + shr $16,%ecx + shr $16,%ebx + cmp %bl, %cl + jne L(end) + cmp %bx, %cx + + .p2align 4 +L(end): + POP (%ebx) + mov $1, %eax + ja L(bigger) + neg %eax +L(bigger): + ret +# else + +/* for wmemcmp */ + .p2align 4 +L(find_diff): + POP (%ebx) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret + +# endif +END (MEMCMP) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S new file mode 100644 index 0000000000..1fc5994a17 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S @@ -0,0 +1,62 @@ +/* Multiple versions of memcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. 
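The ENTRY(memcmp) block that follows is not memcmp itself but a GNU indirect-function (ifunc) resolver: the dynamic loader calls it once while relocating, and whichever implementation address it leaves in %eax becomes the target of every subsequent memcmp call. A hedged C equivalent of the same dispatch, with byte-loop stand-ins for the assembly implementations and GCC's __builtin_cpu_supports standing in for glibc's HAS_CPU_FEATURE:

#include <stddef.h>

/* Placeholder bodies so the sketch is self-contained; in the diff
   these are the SSSE3/SSE4.2 assembly files in this directory.  */
static int
__memcmp_ia32 (const void *a, const void *b, size_t n)
{
  const unsigned char *x = a, *y = b;
  for (; n != 0; n--, x++, y++)
    if (*x != *y)
      return *x < *y ? -1 : 1;
  return 0;
}

static int
__memcmp_ssse3 (const void *a, const void *b, size_t n)
{ return __memcmp_ia32 (a, b, n); /* placeholder body */ }

static int
__memcmp_sse4_2 (const void *a, const void *b, size_t n)
{ return __memcmp_ia32 (a, b, n); /* placeholder body */ }

/* The resolver runs once, when ld.so processes the IRELATIVE
   relocation for memcmp.  */
static int (*memcmp_resolver (void)) (const void *, const void *, size_t)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("sse4.2"))
    return __memcmp_sse4_2;
  if (__builtin_cpu_supports ("ssse3"))
    return __memcmp_ssse3;
  return __memcmp_ia32;
}

int memcmp (const void *, const void *, size_t)
     __attribute__ ((ifunc ("memcmp_resolver")));

As in the assembly, the default is __memcmp_ia32, upgraded first to the SSSE3 and then to the SSE4.2 version when the CPU reports those features.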
*/ +#if IS_IN (libc) + .text +ENTRY(memcmp) + .type memcmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memcmp_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memcmp_ssse3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + LOAD_FUNC_GOT_EAX (__memcmp_sse4_2) +2: ret +END(memcmp) + +# undef ENTRY +# define ENTRY(name) \ + .type __memcmp_ia32, @function; \ + .p2align 4; \ + .globl __memcmp_ia32; \ + .hidden __memcmp_ia32; \ + __memcmp_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memcmp; __GI_memcmp = __memcmp_ia32 +# endif +#endif + +#include "../memcmp.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S new file mode 100644 index 0000000000..2fe2072cb1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S @@ -0,0 +1,681 @@ +/* memcpy optimized with SSE2 unaligned memory access instructions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +# include <sysdep.h> +# include "asm-syntax.h" + +# ifndef MEMCPY +# define MEMCPY __memcpy_sse2_unaligned +# define MEMCPY_CHK __memcpy_chk_sse2_unaligned +# endif + +# ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +# else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 /* Preserve EBX. */ +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) + + .section .text.sse2,"ax",@progbits +# if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +# endif + +ENTRY (MEMCPY) + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + cmp %edx, %eax + +# ifdef USE_AS_MEMMOVE + jg L(check_forward) + +L(mm_len_0_or_more_backward): +/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128] + separately. */ + cmp $16, %ecx + jbe L(mm_len_0_16_bytes_backward) + + cmpl $32, %ecx + jg L(mm_len_32_or_more_backward) + +/* Copy [0..32] and return. 
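The small-size cases that follow never loop over bytes. For 17..32 bytes, for example, the code issues one 16-byte load from the start and one from the end of the source (they overlap somewhere in the middle) and then stores both, covering every length in the range with exactly two loads and two stores. A sketch of the trick, with a hypothetical copy_17_to_32 helper and 16-byte memcpy chunks standing in for the movdqu pairs:

#include <stddef.h>
#include <string.h>

static void
copy_17_to_32 (unsigned char *dst, const unsigned char *src, size_t n)
{
  unsigned char head[16], tail[16];   /* stand-ins for %xmm0/%xmm1 */
  memcpy (head, src, 16);             /* movdqu (%eax), %xmm0 */
  memcpy (tail, src + n - 16, 16);    /* movdqu -16(%eax,%ecx), %xmm1 */
  memcpy (dst, head, 16);             /* movdqu %xmm0, (%edx) */
  memcpy (dst + n - 16, tail, 16);    /* movdqu %xmm1, -16(%edx,%ecx) */
}

Because both loads complete before either store, the sequence is also safe when the buffers overlap, which is why memmove can share these paths.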
*/ + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_32_or_more_backward): + cmpl $64, %ecx + jg L(mm_len_64_or_more_backward) + +/* Copy [0..64] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu -16(%eax, %ecx), %xmm2 + movdqu -32(%eax, %ecx), %xmm3 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, -16(%edx, %ecx) + movdqu %xmm3, -32(%edx, %ecx) + jmp L(return) + +L(mm_len_64_or_more_backward): + cmpl $128, %ecx + jg L(mm_len_128_or_more_backward) + +/* Copy [0..128] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_128_or_more_backward): + add %ecx, %eax + cmp %edx, %eax + movl SRC(%esp), %eax + jle L(forward) + PUSH (%esi) + PUSH (%edi) + PUSH (%ebx) + +/* Aligning the address of destination. */ + movdqu (%eax), %xmm4 + movdqu 16(%eax), %xmm5 + movdqu 32(%eax), %xmm6 + movdqu 48(%eax), %xmm7 + leal (%edx, %ecx), %esi + movdqu -16(%eax, %ecx), %xmm0 + subl $16, %esp + movdqu %xmm0, (%esp) + mov %ecx, %edi + movl %esi, %ecx + andl $-16, %ecx + leal (%ecx), %ebx + subl %edx, %ebx + leal (%eax, %ebx), %eax + shrl $6, %ebx + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %edi +# else +# ifdef SHARED + PUSH (%ebx) + SETUP_PIC_REG (bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi + POP (%ebx) +# else + cmp __x86_shared_cache_size_half, %edi +# endif +# endif + jae L(mm_large_page_loop_backward) + + .p2align 4 +L(mm_main_loop_backward): + + prefetcht0 -128(%eax) + + movdqu -64(%eax), %xmm0 + movdqu -48(%eax), %xmm1 + movdqu -32(%eax), %xmm2 + movdqu -16(%eax), %xmm3 + movaps %xmm0, -64(%ecx) + subl $64, %eax + movaps %xmm1, -48(%ecx) + movaps %xmm2, -32(%ecx) + movaps %xmm3, -16(%ecx) + subl $64, %ecx + sub $1, %ebx + jnz L(mm_main_loop_backward) + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, -16(%esi) + movdqu %xmm4, (%edx) + movdqu %xmm5, 16(%edx) + movdqu %xmm6, 32(%edx) + movdqu %xmm7, 48(%edx) + POP (%ebx) + jmp L(mm_return_pop_all) + +/* Copy [0..16] and return. */ +L(mm_len_0_16_bytes_backward): + testb $24, %cl + jnz L(mm_len_9_16_bytes_backward) + testb $4, %cl + .p2align 4,,5 + jnz L(mm_len_5_8_bytes_backward) + testl %ecx, %ecx + .p2align 4,,2 + je L(return) + testb $2, %cl + .p2align 4,,1 + jne L(mm_len_3_4_bytes_backward) + movzbl -1(%eax,%ecx), %ebx + movzbl (%eax), %eax + movb %bl, -1(%edx,%ecx) + movb %al, (%edx) + jmp L(return) + +L(mm_len_3_4_bytes_backward): + movzwl -2(%eax,%ecx), %ebx + movzwl (%eax), %eax + movw %bx, -2(%edx,%ecx) + movw %ax, (%edx) + jmp L(return) + +L(mm_len_9_16_bytes_backward): + PUSH (%esi) + movl -4(%eax,%ecx), %ebx + movl -8(%eax,%ecx), %esi + movl %ebx, -4(%edx,%ecx) + movl %esi, -8(%edx,%ecx) + subl $8, %ecx + POP (%esi) + jmp L(mm_len_0_16_bytes_backward) + +L(mm_len_5_8_bytes_backward): + movl (%eax), %ebx + movl -4(%eax,%ecx), %eax + movl %ebx, (%edx) + movl %eax, -4(%edx,%ecx) + jmp L(return) + +/* Big length copy backward part. 
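The length was compared above against half the shared cache size (__x86_shared_cache_size_half, or the compile-time SHARED_CACHE_SIZE_HALF); at or beyond that threshold the loop that follows swaps the movaps stores for movntdq non-temporal stores, so a buffer that will not be re-read soon does not evict half the cache, and it finishes with sfence to order the streaming stores. A rough intrinsics version (forward direction for simplicity; assumes dst is 16-byte aligned and n is a multiple of 64, as the surrounding code arranges, and the threshold itself is left out):

#include <emmintrin.h>
#include <stdint.h>

static void
big_copy_nt (uint8_t *dst, const uint8_t *src, size_t n)
{
  for (size_t i = 0; i < n; i += 64)
    {
      __m128i a = _mm_loadu_si128 ((const __m128i *) (src + i));
      __m128i b = _mm_loadu_si128 ((const __m128i *) (src + i + 16));
      __m128i c = _mm_loadu_si128 ((const __m128i *) (src + i + 32));
      __m128i d = _mm_loadu_si128 ((const __m128i *) (src + i + 48));
      _mm_stream_si128 ((__m128i *) (dst + i), a);        /* movntdq */
      _mm_stream_si128 ((__m128i *) (dst + i + 16), b);
      _mm_stream_si128 ((__m128i *) (dst + i + 32), c);
      _mm_stream_si128 ((__m128i *) (dst + i + 48), d);
    }
  _mm_sfence ();   /* make the streaming stores globally visible */
}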
*/ + .p2align 4 +L(mm_large_page_loop_backward): + movdqu -64(%eax), %xmm0 + movdqu -48(%eax), %xmm1 + movdqu -32(%eax), %xmm2 + movdqu -16(%eax), %xmm3 + movntdq %xmm0, -64(%ecx) + subl $64, %eax + movntdq %xmm1, -48(%ecx) + movntdq %xmm2, -32(%ecx) + movntdq %xmm3, -16(%ecx) + subl $64, %ecx + sub $1, %ebx + jnz L(mm_large_page_loop_backward) + sfence + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, -16(%esi) + movdqu %xmm4, (%edx) + movdqu %xmm5, 16(%edx) + movdqu %xmm6, 32(%edx) + movdqu %xmm7, 48(%edx) + POP (%ebx) + jmp L(mm_return_pop_all) + +L(check_forward): + add %edx, %ecx + cmp %eax, %ecx + movl LEN(%esp), %ecx + jle L(forward) + +/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128] + separately. */ + cmp $16, %ecx + jbe L(mm_len_0_16_bytes_forward) + + cmpl $32, %ecx + ja L(mm_len_32_or_more_forward) + +/* Copy [0..32] and return. */ + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_32_or_more_forward): + cmpl $64, %ecx + ja L(mm_len_64_or_more_forward) + +/* Copy [0..64] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu -16(%eax, %ecx), %xmm2 + movdqu -32(%eax, %ecx), %xmm3 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, -16(%edx, %ecx) + movdqu %xmm3, -32(%edx, %ecx) + jmp L(return) + +L(mm_len_64_or_more_forward): + cmpl $128, %ecx + ja L(mm_len_128_or_more_forward) + +/* Copy [0..128] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_128_or_more_forward): + PUSH (%esi) + PUSH (%edi) + PUSH (%ebx) + +/* Aligning the address of destination. 
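The last 64 source bytes are preloaded into XMM4-XMM7 and the first 16 are spilled to the stack, because the aligned loop below will not revisit them; ECX then becomes the destination rounded up to the next 16-byte boundary, the source in EAX is advanced by the bytes thus skipped, and EBX counts the 64-byte chunks remaining up to the buffer end kept in ESI.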
*/ + movdqu -16(%eax, %ecx), %xmm4 + movdqu -32(%eax, %ecx), %xmm5 + movdqu -48(%eax, %ecx), %xmm6 + movdqu -64(%eax, %ecx), %xmm7 + leal (%edx, %ecx), %esi + movdqu (%eax), %xmm0 + subl $16, %esp + movdqu %xmm0, (%esp) + mov %ecx, %edi + leal 16(%edx), %ecx + andl $-16, %ecx + movl %ecx, %ebx + subl %edx, %ebx + addl %ebx, %eax + movl %esi, %ebx + subl %ecx, %ebx + shrl $6, %ebx + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %edi +# else +# ifdef SHARED + PUSH (%ebx) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi + POP (%ebx) +# else + cmp __x86_shared_cache_size_half, %edi +# endif +# endif + jae L(mm_large_page_loop_forward) + + .p2align 4 +L(mm_main_loop_forward): + + prefetcht0 128(%eax) + + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqa %xmm0, (%ecx) + addl $64, %eax + movaps %xmm1, 16(%ecx) + movaps %xmm2, 32(%ecx) + movaps %xmm3, 48(%ecx) + addl $64, %ecx + sub $1, %ebx + jnz L(mm_main_loop_forward) + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, (%edx) + movdqu %xmm4, -16(%esi) + movdqu %xmm5, -32(%esi) + movdqu %xmm6, -48(%esi) + movdqu %xmm7, -64(%esi) + POP (%ebx) + jmp L(mm_return_pop_all) + +L(mm_len_0_16_bytes_forward): + testb $24, %cl + jne L(mm_len_9_16_bytes_forward) + testb $4, %cl + .p2align 4,,5 + jne L(mm_len_5_8_bytes_forward) + testl %ecx, %ecx + .p2align 4,,2 + je L(return) + testb $2, %cl + .p2align 4,,1 + jne L(mm_len_2_4_bytes_forward) + movzbl -1(%eax,%ecx), %ebx + movzbl (%eax), %eax + movb %bl, -1(%edx,%ecx) + movb %al, (%edx) + jmp L(return) + +L(mm_len_2_4_bytes_forward): + movzwl -2(%eax,%ecx), %ebx + movzwl (%eax), %eax + movw %bx, -2(%edx,%ecx) + movw %ax, (%edx) + jmp L(return) + +L(mm_len_5_8_bytes_forward): + movl (%eax), %ebx + movl -4(%eax,%ecx), %eax + movl %ebx, (%edx) + movl %eax, -4(%edx,%ecx) + jmp L(return) + +L(mm_len_9_16_bytes_forward): + movq (%eax), %xmm0 + movq -8(%eax, %ecx), %xmm1 + movq %xmm0, (%edx) + movq %xmm1, -8(%edx, %ecx) + jmp L(return) + +L(mm_return_pop_all): + movl %edx, %eax + POP (%edi) + POP (%esi) + RETURN + +/* Big length copy forward part. 
*/ + .p2align 4 +L(mm_large_page_loop_forward): + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movntdq %xmm0, (%ecx) + addl $64, %eax + movntdq %xmm1, 16(%ecx) + movntdq %xmm2, 32(%ecx) + movntdq %xmm3, 48(%ecx) + addl $64, %ecx + sub $1, %ebx + jnz L(mm_large_page_loop_forward) + sfence + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, (%edx) + movdqu %xmm4, -16(%esi) + movdqu %xmm5, -32(%esi) + movdqu %xmm6, -48(%esi) + movdqu %xmm7, -64(%esi) + POP (%ebx) + jmp L(mm_return_pop_all) +# endif + +L(forward): + cmp $16, %ecx + jbe L(len_0_16_bytes) + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +# endif + jae L(large_page) + + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + cmpl $32, %ecx + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jbe L(return) + + movdqu 16(%eax), %xmm0 + movdqu -32(%eax, %ecx), %xmm1 + cmpl $64, %ecx + movdqu %xmm0, 16(%edx) + movdqu %xmm1, -32(%edx, %ecx) + jbe L(return) + + movdqu 32(%eax), %xmm0 + movdqu 48(%eax), %xmm1 + movdqu -48(%eax, %ecx), %xmm2 + movdqu -64(%eax, %ecx), %xmm3 + cmpl $128, %ecx + movdqu %xmm0, 32(%edx) + movdqu %xmm1, 48(%edx) + movdqu %xmm2, -48(%edx, %ecx) + movdqu %xmm3, -64(%edx, %ecx) + jbe L(return) + +/* Now the main loop: we align the address of the destination. */ + leal 64(%edx), %ebx + andl $-64, %ebx + + addl %edx, %ecx + andl $-64, %ecx + + subl %edx, %eax + +/* We should stop two iterations before the termination + (in order not to misprefetch). */ + subl $64, %ecx + cmpl %ebx, %ecx + je L(main_loop_just_one_iteration) + + subl $64, %ecx + cmpl %ebx, %ecx + je L(main_loop_last_two_iterations) + + .p2align 4 +L(main_loop_cache): + + prefetcht0 128(%ebx, %eax) + + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqa %xmm0, (%ebx) + movaps %xmm1, 16(%ebx) + movaps %xmm2, 32(%ebx) + movaps %xmm3, 48(%ebx) + lea 64(%ebx), %ebx + cmpl %ebx, %ecx + jne L(main_loop_cache) + +L(main_loop_last_two_iterations): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqu 64(%ebx, %eax), %xmm4 + movdqu 80(%ebx, %eax), %xmm5 + movdqu 96(%ebx, %eax), %xmm6 + movdqu 112(%ebx, %eax), %xmm7 + movdqa %xmm0, (%ebx) + movaps %xmm1, 16(%ebx) + movaps %xmm2, 32(%ebx) + movaps %xmm3, 48(%ebx) + movaps %xmm4, 64(%ebx) + movaps %xmm5, 80(%ebx) + movaps %xmm6, 96(%ebx) + movaps %xmm7, 112(%ebx) + jmp L(return) + +L(main_loop_just_one_iteration): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqa %xmm0, (%ebx) + movaps %xmm1, 16(%ebx) + movaps %xmm2, 32(%ebx) + movaps %xmm3, 48(%ebx) + jmp L(return) + +L(large_page): + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + + movdqu 64(%eax), %xmm0 + movdqu 80(%eax), %xmm1 + movdqu 96(%eax), %xmm2 + movdqu 
112(%eax), %xmm3 + movdqu -128(%eax, %ecx), %xmm4 + movdqu -112(%eax, %ecx), %xmm5 + movdqu -96(%eax, %ecx), %xmm6 + movdqu -80(%eax, %ecx), %xmm7 + movdqu %xmm0, 64(%edx) + movdqu %xmm1, 80(%edx) + movdqu %xmm2, 96(%edx) + movdqu %xmm3, 112(%edx) + movdqu %xmm4, -128(%edx, %ecx) + movdqu %xmm5, -112(%edx, %ecx) + movdqu %xmm6, -96(%edx, %ecx) + movdqu %xmm7, -80(%edx, %ecx) + +/* Now the main loop with non temporal stores. We align + the address of the destination. */ + leal 128(%edx), %ebx + andl $-128, %ebx + + addl %edx, %ecx + andl $-128, %ecx + + subl %edx, %eax + + .p2align 4 +L(main_loop_large_page): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqu 64(%ebx, %eax), %xmm4 + movdqu 80(%ebx, %eax), %xmm5 + movdqu 96(%ebx, %eax), %xmm6 + movdqu 112(%ebx, %eax), %xmm7 + movntdq %xmm0, (%ebx) + movntdq %xmm1, 16(%ebx) + movntdq %xmm2, 32(%ebx) + movntdq %xmm3, 48(%ebx) + movntdq %xmm4, 64(%ebx) + movntdq %xmm5, 80(%ebx) + movntdq %xmm6, 96(%ebx) + movntdq %xmm7, 112(%ebx) + lea 128(%ebx), %ebx + cmpl %ebx, %ecx + jne L(main_loop_large_page) + sfence + jmp L(return) + +L(len_0_16_bytes): + testb $24, %cl + jne L(len_9_16_bytes) + testb $4, %cl + .p2align 4,,5 + jne L(len_5_8_bytes) + testl %ecx, %ecx + .p2align 4,,2 + je L(return) + movzbl (%eax), %ebx + testb $2, %cl + movb %bl, (%edx) + je L(return) + movzwl -2(%eax,%ecx), %ebx + movw %bx, -2(%edx,%ecx) + jmp L(return) + +L(len_9_16_bytes): + movq (%eax), %xmm0 + movq -8(%eax, %ecx), %xmm1 + movq %xmm0, (%edx) + movq %xmm1, -8(%edx, %ecx) + jmp L(return) + +L(len_5_8_bytes): + movl (%eax), %ebx + movl %ebx, (%edx) + movl -4(%eax,%ecx), %ebx + movl %ebx, -4(%edx,%ecx) + +L(return): + movl %edx, %eax +# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif + RETURN + +END (MEMCPY) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S new file mode 100644 index 0000000000..687e083147 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S @@ -0,0 +1,1809 @@ +/* memcpy with SSSE3 and REP string. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + +#if IS_IN (libc) \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +#include "asm-syntax.h" + +#ifndef MEMCPY +# define MEMCPY __memcpy_ssse3_rep +# define MEMCPY_CHK __memcpy_chk_ssse3_rep +#endif + +#ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +#else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +#endif + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef SHARED +# define PARMS 8 /* Preserve EBX. */ +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx + +# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \ + addl $(TABLE - .), %ebx + +# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ + addl (%ebx,INDEX,SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx +#else +# define PARMS 4 +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) + +# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) + +# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +#endif + + .section .text.ssse3,"ax",@progbits +#if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif +ENTRY (MEMCPY) + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + +#ifdef USE_AS_MEMMOVE + cmp %eax, %edx + jb L(copy_forward) + je L(fwd_write_0bytes) + cmp $48, %ecx + jb L(bk_write_less48bytes) + add %ecx, %eax + cmp %eax, %edx + movl SRC(%esp), %eax + jb L(copy_backward) + +L(copy_forward): +#endif + cmp $48, %ecx + jae L(48bytesormore) + +L(fwd_write_less32bytes): +#ifndef USE_AS_MEMMOVE + cmp %dl, %al + jb L(bk_write) +#endif + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) +#ifndef USE_AS_MEMMOVE +L(bk_write): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) +#endif + + ALIGN (4) +/* ECX > 32 and EDX is 4 byte aligned. 
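The code below first loads the leading 16 source bytes into XMM0, rounds EDX up to the next 16-byte boundary (keeping the original destination in ESI), and adjusts ECX and EAX by the bytes skipped; the length is then compared against half the shared cache size to choose between the non-temporal L(large_page) path and one of the L(shl_0)..L(shl_15) loops, indexed by the source's residual misalignment in EDI.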
*/ +L(48bytesormore): + movdqu (%eax), %xmm0 + PUSH (%edi) + movl %edx, %edi + and $-16, %edx + PUSH (%esi) + cfi_remember_state + add $16, %edx + movl %edi, %esi + sub %edx, %edi + add %edi, %ecx + sub %edi, %eax + +#ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +#endif + + mov %eax, %edi + jae L(large_page) + and $0xf, %edi + jz L(shl_0) + + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) + + ALIGN (4) +L(shl_0): + movdqu %xmm0, (%esi) + xor %edi, %edi + cmp $127, %ecx + ja L(shl_0_gobble) + lea -32(%ecx), %ecx +L(shl_0_loop): + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi +L(shl_0_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + add %edi, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state +L(shl_0_gobble): + +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi +# else + mov __x86_data_cache_size_half, %edi +# endif +#endif + mov %edi, %esi + shr $3, %esi + sub %esi, %edi + cmp %edi, %ecx + jae L(shl_0_gobble_mem_start) + sub $128, %ecx + ALIGN (4) +L(shl_0_gobble_cache_loop): + movdqa (%eax), %xmm0 + movaps 0x10(%eax), %xmm1 + movaps 0x20(%eax), %xmm2 + movaps 0x30(%eax), %xmm3 + movaps 0x40(%eax), %xmm4 + movaps 0x50(%eax), %xmm5 + movaps 0x60(%eax), %xmm6 + movaps 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $128, %ecx + movdqa %xmm0, (%edx) + movaps %xmm1, 0x10(%edx) + movaps %xmm2, 0x20(%edx) + movaps %xmm3, 0x30(%edx) + movaps %xmm4, 0x40(%edx) + movaps %xmm5, 0x50(%edx) + movaps %xmm6, 0x60(%edx) + movaps %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_cache_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(shl_0_cache_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx +L(shl_0_cache_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_cache_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx +L(shl_0_cache_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_cache_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx +L(shl_0_cache_less_16bytes): + add %ecx, %edx + add %ecx, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN 
(4) +L(shl_0_gobble_mem_start): + cmp %al, %dl + je L(copy_page_by_rep) + sub $128, %ecx +L(shl_0_gobble_mem_loop): + prefetchnta 0x1c0(%eax) + prefetchnta 0x280(%eax) + prefetchnta 0x1c0(%edx) + prefetchnta 0x280(%edx) + + movdqa (%eax), %xmm0 + movaps 0x10(%eax), %xmm1 + movaps 0x20(%eax), %xmm2 + movaps 0x30(%eax), %xmm3 + movaps 0x40(%eax), %xmm4 + movaps 0x50(%eax), %xmm5 + movaps 0x60(%eax), %xmm6 + movaps 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $0x80, %ecx + movdqa %xmm0, (%edx) + movaps %xmm1, 0x10(%edx) + movaps %xmm2, 0x20(%edx) + movaps %xmm3, 0x30(%edx) + movaps %xmm4, 0x40(%edx) + movaps %xmm5, 0x50(%edx) + movaps %xmm6, 0x60(%edx) + movaps %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_mem_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(shl_0_mem_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx +L(shl_0_mem_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_mem_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx +L(shl_0_mem_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_mem_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx +L(shl_0_mem_less_16bytes): + add %ecx, %edx + add %ecx, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_1): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $1, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_1_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_1_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_1_loop) + +L(shl_1_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 1(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_2): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $2, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_2_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_2_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_2_loop) + +L(shl_2_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 2(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) 
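The L(shl_1) and L(shl_2) blocks above, and the thirteen L(shl_K) siblings that follow, all instantiate one technique: once the destination is 16-byte aligned, the source pointer is pulled back by K bytes so every iteration can use aligned loads, and each output vector is stitched from two consecutive input vectors with PALIGNR $K. A minimal C sketch of that recombination step (the helper name and the byte-wise memcpy form are illustrative, not the library's code):

	#include <stdint.h>
	#include <string.h>

	/* Produce the 16 unaligned source bytes that start K bytes into
	   the aligned block PREV, by combining the high 16-K bytes of
	   PREV with the low K bytes of the next aligned block CUR --
	   what "palignr $K, %xmm_prev, %xmm_cur" computes.  K is 1..15.  */
	static void
	palignr_step (uint8_t out[16], const uint8_t prev[16],
		      const uint8_t cur[16], unsigned k)
	{
	  memcpy (out, prev + k, 16 - k);	/* tail of previous block */
	  memcpy (out + (16 - k), cur, k);	/* head of current block  */
	}

The assembly performs this step twice per unrolled half, so each pass moves 32 bytes while touching only 16-byte-aligned memory on both the load and store sides.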
+L(shl_3): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $3, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_3_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_3_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_3_loop) + +L(shl_3_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 3(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_4): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $4, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_4_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_4_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_4_loop) + +L(shl_4_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 4(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_5): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $5, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_5_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_5_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_5_loop) + +L(shl_5_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 5(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_6): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $6, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_6_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_6_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_6_loop) + +L(shl_6_end): + add $32, %ecx + add %ecx, 
%edi + add %edi, %edx + lea 6(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_7): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $7, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_7_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_7_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_7_loop) + +L(shl_7_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 7(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_8): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $8, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_8_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_8_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_8_loop) + +L(shl_8_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 8(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_9): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $9, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_9_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_9_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_9_loop) + +L(shl_9_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 9(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_10): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $10, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_10_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_10_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $10, %xmm2, 
%xmm3 + palignr $10, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_10_loop) + +L(shl_10_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 10(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_11): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $11, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_11_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_11_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_11_loop) + +L(shl_11_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 11(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_12): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $12, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_12_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_12_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_12_loop) + +L(shl_12_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 12(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_13): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $13, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_13_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_13_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_13_loop) + +L(shl_13_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 13(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_14): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $14, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_14_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + lea 
32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_14_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_14_loop) + +L(shl_14_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 14(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_15): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $15, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_15_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_15_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_15_loop) + +L(shl_15_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 15(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(fwd_write_44bytes): + movl -44(%eax), %ecx + movl %ecx, -44(%edx) +L(fwd_write_40bytes): + movl -40(%eax), %ecx + movl %ecx, -40(%edx) +L(fwd_write_36bytes): + movl -36(%eax), %ecx + movl %ecx, -36(%edx) +L(fwd_write_32bytes): + movl -32(%eax), %ecx + movl %ecx, -32(%edx) +L(fwd_write_28bytes): + movl -28(%eax), %ecx + movl %ecx, -28(%edx) +L(fwd_write_24bytes): + movl -24(%eax), %ecx + movl %ecx, -24(%edx) +L(fwd_write_20bytes): + movl -20(%eax), %ecx + movl %ecx, -20(%edx) +L(fwd_write_16bytes): + movl -16(%eax), %ecx + movl %ecx, -16(%edx) +L(fwd_write_12bytes): + movl -12(%eax), %ecx + movl %ecx, -12(%edx) +L(fwd_write_8bytes): + movl -8(%eax), %ecx + movl %ecx, -8(%edx) +L(fwd_write_4bytes): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +L(fwd_write_0bytes): +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_5bytes): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_45bytes): + movl -45(%eax), %ecx + movl %ecx, -45(%edx) +L(fwd_write_41bytes): + movl -41(%eax), %ecx + movl %ecx, -41(%edx) +L(fwd_write_37bytes): + movl -37(%eax), %ecx + movl %ecx, -37(%edx) +L(fwd_write_33bytes): + movl -33(%eax), %ecx + movl %ecx, -33(%edx) +L(fwd_write_29bytes): + movl -29(%eax), %ecx + movl %ecx, -29(%edx) +L(fwd_write_25bytes): + movl -25(%eax), %ecx + movl %ecx, -25(%edx) +L(fwd_write_21bytes): + movl -21(%eax), %ecx + movl %ecx, -21(%edx) +L(fwd_write_17bytes): + movl -17(%eax), %ecx + movl %ecx, -17(%edx) +L(fwd_write_13bytes): + movl -13(%eax), %ecx + movl %ecx, -13(%edx) +L(fwd_write_9bytes): + movl -9(%eax), %ecx + movl %ecx, -9(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) +L(fwd_write_1bytes): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif 
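The L(fwd_write_*bytes) labels around this point form fall-through ladders: BRANCH_TO_JMPTBL_ENTRY indexes L(table_48bytes_fwd) by the residual length, and entering the ladder at rung N copies exactly N bytes, each rung moving one 32-bit word and the final rung absorbing the odd 1-3 bytes. The same structure in C is a switch whose cases deliberately fall through; a sketch with a hypothetical byte-per-rung, 8-entry ladder:

	/* Copy the last N (0..7) bytes; every case falls through to the
	   smaller ones, mirroring the jump-table ladders above.  */
	static void
	tail_copy (unsigned char *dst, const unsigned char *src, unsigned n)
	{
	  switch (n)
	    {
	    case 7: dst[6] = src[6];	/* FALLTHROUGH */
	    case 6: dst[5] = src[5];	/* FALLTHROUGH */
	    case 5: dst[4] = src[4];	/* FALLTHROUGH */
	    case 4: dst[3] = src[3];	/* FALLTHROUGH */
	    case 3: dst[2] = src[2];	/* FALLTHROUGH */
	    case 2: dst[1] = src[1];	/* FALLTHROUGH */
	    case 1: dst[0] = src[0];	/* FALLTHROUGH */
	    case 0: break;
	    }
	}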
+#endif + RETURN + + ALIGN (4) +L(fwd_write_46bytes): + movl -46(%eax), %ecx + movl %ecx, -46(%edx) +L(fwd_write_42bytes): + movl -42(%eax), %ecx + movl %ecx, -42(%edx) +L(fwd_write_38bytes): + movl -38(%eax), %ecx + movl %ecx, -38(%edx) +L(fwd_write_34bytes): + movl -34(%eax), %ecx + movl %ecx, -34(%edx) +L(fwd_write_30bytes): + movl -30(%eax), %ecx + movl %ecx, -30(%edx) +L(fwd_write_26bytes): + movl -26(%eax), %ecx + movl %ecx, -26(%edx) +L(fwd_write_22bytes): + movl -22(%eax), %ecx + movl %ecx, -22(%edx) +L(fwd_write_18bytes): + movl -18(%eax), %ecx + movl %ecx, -18(%edx) +L(fwd_write_14bytes): + movl -14(%eax), %ecx + movl %ecx, -14(%edx) +L(fwd_write_10bytes): + movl -10(%eax), %ecx + movl %ecx, -10(%edx) +L(fwd_write_6bytes): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) +L(fwd_write_2bytes): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_47bytes): + movl -47(%eax), %ecx + movl %ecx, -47(%edx) +L(fwd_write_43bytes): + movl -43(%eax), %ecx + movl %ecx, -43(%edx) +L(fwd_write_39bytes): + movl -39(%eax), %ecx + movl %ecx, -39(%edx) +L(fwd_write_35bytes): + movl -35(%eax), %ecx + movl %ecx, -35(%edx) +L(fwd_write_31bytes): + movl -31(%eax), %ecx + movl %ecx, -31(%edx) +L(fwd_write_27bytes): + movl -27(%eax), %ecx + movl %ecx, -27(%edx) +L(fwd_write_23bytes): + movl -23(%eax), %ecx + movl %ecx, -23(%edx) +L(fwd_write_19bytes): + movl -19(%eax), %ecx + movl %ecx, -19(%edx) +L(fwd_write_15bytes): + movl -15(%eax), %ecx + movl %ecx, -15(%edx) +L(fwd_write_11bytes): + movl -11(%eax), %ecx + movl %ecx, -11(%edx) +L(fwd_write_7bytes): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) +L(fwd_write_3bytes): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN_END + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(large_page): + movdqu (%eax), %xmm1 + movdqu %xmm0, (%esi) + movntdq %xmm1, (%edx) + add $0x10, %eax + add $0x10, %edx + sub $0x10, %ecx + cmp %al, %dl + je L(copy_page_by_rep) +L(large_page_loop_init): + POP (%esi) + sub $0x80, %ecx + POP (%edi) +L(large_page_loop): + prefetchnta 0x1c0(%eax) + prefetchnta 0x280(%eax) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + movdqu 0x40(%eax), %xmm4 + movdqu 0x50(%eax), %xmm5 + movdqu 0x60(%eax), %xmm6 + movdqu 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + lfence + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + movntdq %xmm4, 0x40(%edx) + movntdq %xmm5, 0x50(%edx) + movntdq %xmm6, 0x60(%edx) + movntdq %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + jae L(large_page_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(large_page_less_64bytes) + + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + lea 0x40(%eax), %eax + + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + lea 0x40(%edx), %edx + sub $0x40, %ecx +L(large_page_less_64bytes): + cmp $32, %ecx + jb L(large_page_less_32bytes) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + lea 0x20(%eax), %eax + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + lea 0x20(%edx), %edx + sub $0x20, %ecx +L(large_page_less_32bytes): + add %ecx, %edx + add %ecx, %eax + sfence 
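L(large_page) above streams data with MOVNTDQ, which writes around the caches; the single SFENCE just emitted is what makes those weakly-ordered stores globally visible before the tail dispatch and return. The same pattern with SSE2 intrinsics (an illustrative helper that assumes a 16-byte-aligned destination and a multiple-of-16 length; the real code adds prefetching and tail handling):

	#include <emmintrin.h>
	#include <stddef.h>
	#include <stdint.h>

	static void
	stream_copy (uint8_t *dst, const uint8_t *src, size_t n)
	{
	  for (size_t i = 0; i < n; i += 16)
	    {
	      __m128i v = _mm_loadu_si128 ((const __m128i *) (src + i));
	      _mm_stream_si128 ((__m128i *) (dst + i), v);	/* movntdq */
	    }
	  _mm_sfence ();	/* order the non-temporal stores */
	}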
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(copy_page_by_rep): + mov %eax, %esi + mov %edx, %edi + mov %ecx, %edx + shr $2, %ecx + and $3, %edx + rep movsl + jz L(copy_page_by_rep_exit) + cmp $2, %edx + jb L(copy_page_by_rep_left_1) + movzwl (%esi), %eax + movw %ax, (%edi) + add $2, %esi + add $2, %edi + sub $2, %edx + jz L(copy_page_by_rep_exit) +L(copy_page_by_rep_left_1): + movzbl (%esi), %eax + movb %al, (%edi) +L(copy_page_by_rep_exit): + POP (%esi) + POP (%edi) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_44bytes): + movl 40(%eax), %ecx + movl %ecx, 40(%edx) +L(bk_write_40bytes): + movl 36(%eax), %ecx + movl %ecx, 36(%edx) +L(bk_write_36bytes): + movl 32(%eax), %ecx + movl %ecx, 32(%edx) +L(bk_write_32bytes): + movl 28(%eax), %ecx + movl %ecx, 28(%edx) +L(bk_write_28bytes): + movl 24(%eax), %ecx + movl %ecx, 24(%edx) +L(bk_write_24bytes): + movl 20(%eax), %ecx + movl %ecx, 20(%edx) +L(bk_write_20bytes): + movl 16(%eax), %ecx + movl %ecx, 16(%edx) +L(bk_write_16bytes): + movl 12(%eax), %ecx + movl %ecx, 12(%edx) +L(bk_write_12bytes): + movl 8(%eax), %ecx + movl %ecx, 8(%edx) +L(bk_write_8bytes): + movl 4(%eax), %ecx + movl %ecx, 4(%edx) +L(bk_write_4bytes): + movl (%eax), %ecx + movl %ecx, (%edx) +L(bk_write_0bytes): +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_45bytes): + movl 41(%eax), %ecx + movl %ecx, 41(%edx) +L(bk_write_41bytes): + movl 37(%eax), %ecx + movl %ecx, 37(%edx) +L(bk_write_37bytes): + movl 33(%eax), %ecx + movl %ecx, 33(%edx) +L(bk_write_33bytes): + movl 29(%eax), %ecx + movl %ecx, 29(%edx) +L(bk_write_29bytes): + movl 25(%eax), %ecx + movl %ecx, 25(%edx) +L(bk_write_25bytes): + movl 21(%eax), %ecx + movl %ecx, 21(%edx) +L(bk_write_21bytes): + movl 17(%eax), %ecx + movl %ecx, 17(%edx) +L(bk_write_17bytes): + movl 13(%eax), %ecx + movl %ecx, 13(%edx) +L(bk_write_13bytes): + movl 9(%eax), %ecx + movl %ecx, 9(%edx) +L(bk_write_9bytes): + movl 5(%eax), %ecx + movl %ecx, 5(%edx) +L(bk_write_5bytes): + movl 1(%eax), %ecx + movl %ecx, 1(%edx) +L(bk_write_1bytes): + movzbl (%eax), %ecx + movb %cl, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_46bytes): + movl 42(%eax), %ecx + movl %ecx, 42(%edx) +L(bk_write_42bytes): + movl 38(%eax), %ecx + movl %ecx, 38(%edx) +L(bk_write_38bytes): + movl 34(%eax), %ecx + movl %ecx, 34(%edx) +L(bk_write_34bytes): + movl 30(%eax), %ecx + movl %ecx, 30(%edx) +L(bk_write_30bytes): + movl 26(%eax), %ecx + movl %ecx, 26(%edx) +L(bk_write_26bytes): + movl 22(%eax), %ecx + movl %ecx, 22(%edx) +L(bk_write_22bytes): + movl 18(%eax), %ecx + movl %ecx, 18(%edx) +L(bk_write_18bytes): + movl 14(%eax), %ecx + movl %ecx, 14(%edx) +L(bk_write_14bytes): + movl 10(%eax), %ecx + movl %ecx, 10(%edx) +L(bk_write_10bytes): + movl 6(%eax), %ecx + movl %ecx, 6(%edx) +L(bk_write_6bytes): + movl 2(%eax), %ecx + movl %ecx, 2(%edx) +L(bk_write_2bytes): + movzwl (%eax), %ecx + movw %cx, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_47bytes): + movl 43(%eax), %ecx + movl %ecx, 43(%edx) +L(bk_write_43bytes): + movl 39(%eax), %ecx + 
movl %ecx, 39(%edx) +L(bk_write_39bytes): + movl 35(%eax), %ecx + movl %ecx, 35(%edx) +L(bk_write_35bytes): + movl 31(%eax), %ecx + movl %ecx, 31(%edx) +L(bk_write_31bytes): + movl 27(%eax), %ecx + movl %ecx, 27(%edx) +L(bk_write_27bytes): + movl 23(%eax), %ecx + movl %ecx, 23(%edx) +L(bk_write_23bytes): + movl 19(%eax), %ecx + movl %ecx, 19(%edx) +L(bk_write_19bytes): + movl 15(%eax), %ecx + movl %ecx, 15(%edx) +L(bk_write_15bytes): + movl 11(%eax), %ecx + movl %ecx, 11(%edx) +L(bk_write_11bytes): + movl 7(%eax), %ecx + movl %ecx, 7(%edx) +L(bk_write_7bytes): + movl 3(%eax), %ecx + movl %ecx, 3(%edx) +L(bk_write_3bytes): + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN_END + + + .pushsection .rodata.ssse3,"a",@progbits + ALIGN (2) +L(table_48bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) + .int JMPTBL 
(L(fwd_write_44bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) + + ALIGN (2) +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + ALIGN (2) +L(table_48_bytes_bwd): + .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_44bytes), 
L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) + + .popsection + +#ifdef USE_AS_MEMMOVE + ALIGN (4) +L(copy_backward): + PUSH (%esi) + movl %eax, %esi + add %ecx, %edx + add %ecx, %esi + testl $0x3, %edx + jnz L(bk_align) + +L(bk_aligned_4): + cmp $64, %ecx + jae L(bk_write_more64bytes) + +L(bk_write_64bytesless): + cmp $32, %ecx + jb L(bk_write_less32bytes) + +L(bk_write_more32bytes): + /* Copy 32 bytes at a time. */ + sub $32, %ecx + movl -4(%esi), %eax + movl %eax, -4(%edx) + movl -8(%esi), %eax + movl %eax, -8(%edx) + movl -12(%esi), %eax + movl %eax, -12(%edx) + movl -16(%esi), %eax + movl %eax, -16(%edx) + movl -20(%esi), %eax + movl %eax, -20(%edx) + movl -24(%esi), %eax + movl %eax, -24(%edx) + movl -28(%esi), %eax + movl %eax, -28(%edx) + movl -32(%esi), %eax + movl %eax, -32(%edx) + sub $32, %edx + sub $32, %esi + +L(bk_write_less32bytes): + movl %esi, %eax + sub %ecx, %edx + sub %ecx, %eax + POP (%esi) +L(bk_write_less48bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) + + CFI_PUSH (%esi) + ALIGN (4) +L(bk_align): + cmp $8, %ecx + jbe L(bk_write_less32bytes) + testl $1, %edx + /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, + then (EDX & 2) must be != 0. */ + jz L(bk_got2) + sub $1, %esi + sub $1, %ecx + sub $1, %edx + movzbl (%esi), %eax + movb %al, (%edx) + + testl $2, %edx + jz L(bk_aligned_4) + +L(bk_got2): + sub $2, %esi + sub $2, %ecx + sub $2, %edx + movzwl (%esi), %eax + movw %ax, (%edx) + jmp L(bk_aligned_4) + + ALIGN (4) +L(bk_write_more64bytes): + /* Check alignment of last byte. */ + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + +/* EDX is aligned 4 bytes, but not 16 bytes. */ +L(bk_ssse3_align): + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + +L(bk_ssse3_cpy_pre): + cmp $64, %ecx + jb L(bk_write_more32bytes) + +L(bk_ssse3_cpy): + sub $64, %esi + sub $64, %ecx + sub $64, %edx + movdqu 0x30(%esi), %xmm3 + movdqa %xmm3, 0x30(%edx) + movdqu 0x20(%esi), %xmm2 + movdqa %xmm2, 0x20(%edx) + movdqu 0x10(%esi), %xmm1 + movdqa %xmm1, 0x10(%edx) + movdqu (%esi), %xmm0 + movdqa %xmm0, (%edx) + cmp $64, %ecx + jae L(bk_ssse3_cpy) + jmp L(bk_write_64bytesless) + +#endif + +END (MEMCPY) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S new file mode 100644 index 0000000000..53e8a6ca1d --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S @@ -0,0 +1,3162 @@ +/* memcpy with SSSE3 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +# include <sysdep.h> +# include "asm-syntax.h" + +# ifndef MEMCPY +# define MEMCPY __memcpy_ssse3 +# define MEMCPY_CHK __memcpy_chk_ssse3 +# endif + +# ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +# else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifdef SHARED +# define PARMS 8 /* Preserve EBX. */ +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx, INDEX, SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx +# else + +# define PARMS 4 +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. 
*/ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(, INDEX, SCALE) +# endif + + .section .text.ssse3,"ax",@progbits +# if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +# endif +ENTRY (MEMCPY) + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + +# ifdef USE_AS_MEMMOVE + cmp %eax, %edx + jb L(copy_forward) + je L(fwd_write_0bytes) + cmp $32, %ecx + jae L(memmove_bwd) + jmp L(bk_write_less32bytes_2) + + .p2align 4 +L(memmove_bwd): + add %ecx, %eax + cmp %eax, %edx + movl SRC(%esp), %eax + jb L(copy_backward) + +L(copy_forward): +# endif + cmp $48, %ecx + jae L(48bytesormore) + +L(fwd_write_less32bytes): +# ifndef USE_AS_MEMMOVE + cmp %dl, %al + jb L(bk_write) +# endif + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) +# ifndef USE_AS_MEMMOVE + .p2align 4 +L(bk_write): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) +# endif + + .p2align 4 +L(48bytesormore): +# ifndef USE_AS_MEMMOVE + movlpd (%eax), %xmm0 + movlpd 8(%eax), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) +# else + movdqu (%eax), %xmm0 +# endif + PUSH (%edi) + movl %edx, %edi + and $-16, %edx + add $16, %edx + sub %edx, %edi + add %edi, %ecx + sub %edi, %eax + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +# endif + + mov %eax, %edi + jae L(large_page) + and $0xf, %edi + jz L(shl_0) + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) + + .p2align 4 +L(shl_0): +# ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +# endif + xor %edi, %edi + cmp $127, %ecx + ja L(shl_0_gobble) + lea -32(%ecx), %ecx + + .p2align 4 +L(shl_0_loop): + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + +L(shl_0_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + add %edi, %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_0_gobble): +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + POP (%edi) + lea -128(%ecx), %ecx + jae L(shl_0_gobble_mem_loop) + + .p2align 4 +L(shl_0_gobble_cache_loop): + movdqa (%eax), %xmm0 + movdqa 0x10(%eax), %xmm1 + movdqa 0x20(%eax), %xmm2 + movdqa 0x30(%eax), %xmm3 + movdqa 0x40(%eax), %xmm4 + movdqa 0x50(%eax), %xmm5 + movdqa 0x60(%eax), %xmm6 + movdqa 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa %xmm2, 0x20(%edx) + 
+
+	.p2align 4
+L(shl_0):
+# ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi
+	movdqu	%xmm0, (%edi)
+# endif
+	xor	%edi, %edi
+	cmp	$127, %ecx
+	ja	L(shl_0_gobble)
+	lea	-32(%ecx), %ecx
+
+	.p2align 4
+L(shl_0_loop):
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+	jb	L(shl_0_end)
+
+	movdqa	(%eax, %edi), %xmm0
+	movdqa	16(%eax, %edi), %xmm1
+	sub	$32, %ecx
+	movdqa	%xmm0, (%edx, %edi)
+	movdqa	%xmm1, 16(%edx, %edi)
+	lea	32(%edi), %edi
+
+L(shl_0_end):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	add	%edi, %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_0_gobble):
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+# endif
+	POP	(%edi)
+	lea	-128(%ecx), %ecx
+	jae	L(shl_0_gobble_mem_loop)
+
+	.p2align 4
+L(shl_0_gobble_cache_loop):
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$128, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_cache_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(shl_0_cache_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+
+L(shl_0_cache_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_cache_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+
+L(shl_0_cache_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_cache_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+
+L(shl_0_cache_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	.p2align 4
+L(shl_0_gobble_mem_loop):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x280(%eax)
+	prefetcht0 0x1c0(%edx)
+
+	movdqa	(%eax), %xmm0
+	movdqa	0x10(%eax), %xmm1
+	movdqa	0x20(%eax), %xmm2
+	movdqa	0x30(%eax), %xmm3
+	movdqa	0x40(%eax), %xmm4
+	movdqa	0x50(%eax), %xmm5
+	movdqa	0x60(%eax), %xmm6
+	movdqa	0x70(%eax), %xmm7
+	lea	0x80(%eax), %eax
+	sub	$0x80, %ecx
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	movdqa	%xmm2, 0x20(%edx)
+	movdqa	%xmm3, 0x30(%edx)
+	movdqa	%xmm4, 0x40(%edx)
+	movdqa	%xmm5, 0x50(%edx)
+	movdqa	%xmm6, 0x60(%edx)
+	movdqa	%xmm7, 0x70(%edx)
+	lea	0x80(%edx), %edx
+
+	jae	L(shl_0_gobble_mem_loop)
+	cmp	$-0x40, %ecx
+	lea	0x80(%ecx), %ecx
+	jl	L(shl_0_mem_less_64bytes)
+
+	movdqa	(%eax), %xmm0
+	sub	$0x40, %ecx
+	movdqa	0x10(%eax), %xmm1
+
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+
+	movdqa	0x20(%eax), %xmm0
+	movdqa	0x30(%eax), %xmm1
+	add	$0x40, %eax
+
+	movdqa	%xmm0, 0x20(%edx)
+	movdqa	%xmm1, 0x30(%edx)
+	add	$0x40, %edx
+
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %ecx
+	jb	L(shl_0_mem_less_32bytes)
+	movdqa	(%eax), %xmm0
+	sub	$0x20, %ecx
+	movdqa	0x10(%eax), %xmm1
+	add	$0x20, %eax
+	movdqa	%xmm0, (%edx)
+	movdqa	%xmm1, 0x10(%edx)
+	add	$0x20, %edx
+
+L(shl_0_mem_less_32bytes):
+	cmp	$0x10, %ecx
+	jb	L(shl_0_mem_less_16bytes)
+	sub	$0x10, %ecx
+	movdqa	(%eax), %xmm0
+	add	$0x10, %eax
+	movdqa	%xmm0, (%edx)
+	add	$0x10, %edx
+
+L(shl_0_mem_less_16bytes):
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
+
+	.p2align 4
+L(shl_1):
+# ifndef USE_AS_MEMMOVE
+	movaps	-1(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-1(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+# endif
+	jb	L(sh_1_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl1LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	movaps	47(%eax), %xmm4
+	movaps	63(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$1, %xmm4, %xmm5
+	palignr	$1, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl1LoopStart)
+
+L(Shl1LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_1_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-1(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_1_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_1_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_1_no_prefetch_loop)
+
+L(sh_1_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	1(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_2):
+# ifndef USE_AS_MEMMOVE
+	movaps	-2(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-2(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+	cmp	__x86_data_cache_size_half, %ecx
+# endif
+# endif
+	jb	L(sh_2_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl2LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	movaps	46(%eax), %xmm4
+	movaps	62(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$2, %xmm4, %xmm5
+	palignr	$2, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl2LoopStart)
+
+L(Shl2LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_2_no_prefetch):
+	lea	-32(%ecx), %ecx
+	lea	-2(%eax), %eax
+	xor	%edi, %edi
+
+	.p2align 4
+L(sh_2_no_prefetch_loop):
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jb	L(sh_2_end_no_prefetch_loop)
+
+	movdqa	16(%eax, %edi), %xmm2
+	sub	$32, %ecx
+	movdqa	32(%eax, %edi), %xmm3
+	movdqa	%xmm3, %xmm1
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm4, %xmm2
+	lea	32(%edi), %edi
+	movdqa	%xmm2, -32(%edx, %edi)
+	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_2_no_prefetch_loop)
+
+L(sh_2_end_no_prefetch_loop):
+	lea	32(%ecx), %ecx
+	add	%ecx, %edi
+	add	%edi, %edx
+	lea	2(%edi, %eax), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(shl_3):
+# ifndef USE_AS_MEMMOVE
movaps -3(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -3(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_3_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl3LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + movaps 45(%eax), %xmm4 + movaps 61(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + palignr $3, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $3, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $3, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl3LoopStart) + +L(Shl3LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_3_no_prefetch): + lea -32(%ecx), %ecx + lea -3(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_3_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_3_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_3_no_prefetch_loop) + +L(sh_3_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 3(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_4): +# ifndef USE_AS_MEMMOVE + movaps -4(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -4(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_4_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl4LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + movaps 44(%eax), %xmm4 + movaps 60(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + palignr $4, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $4, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $4, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl4LoopStart) + +L(Shl4LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 
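/* Every L(shl_N) block above and below implements the same splice:
   load 16-byte chunks from the aligned address just below the
   misaligned source, then recombine neighbouring chunks with PALIGNR
   so that all loads and stores stay aligned.  A hedged intrinsics
   sketch of one such block (SHIFT, carry and chunk are illustrative
   names; build with -mssse3).  PALIGNR takes an immediate shift,
   which is why the assembly repeats the block once per shift value:

	#include <stddef.h>
	#include <tmmintrin.h>	/* SSSE3: _mm_alignr_epi8 */

	#define SHIFT 3		/* e.g. the L(shl_3) case */

	/* Assumes d and s - SHIFT are 16-byte aligned and len is a
	   multiple of 16.  */
	static void shl_sketch (char *d, const char *s, size_t len)
	{
	  const char *a = s - SHIFT;	/* aligned cursor below the source */
	  __m128i carry = _mm_load_si128 ((const __m128i *) a);
	  for (; len >= 16; len -= 16, a += 16, d += 16)
	    {
	      __m128i chunk = _mm_load_si128 ((const __m128i *) (a + 16));
	      _mm_store_si128 ((__m128i *) d,
			       _mm_alignr_epi8 (chunk, carry, SHIFT));
	      carry = chunk;
	    }
	}
*/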
+L(sh_4_no_prefetch): + lea -32(%ecx), %ecx + lea -4(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_4_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_4_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_4_no_prefetch_loop) + +L(sh_4_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 4(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_5): +# ifndef USE_AS_MEMMOVE + movaps -5(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -5(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_5_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl5LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + movaps 43(%eax), %xmm4 + movaps 59(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + palignr $5, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $5, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $5, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl5LoopStart) + +L(Shl5LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_5_no_prefetch): + lea -32(%ecx), %ecx + lea -5(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_5_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_5_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_5_no_prefetch_loop) + +L(sh_5_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 5(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_6): +# ifndef USE_AS_MEMMOVE + movaps -6(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -6(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb 
L(sh_6_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl6LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + movaps 42(%eax), %xmm4 + movaps 58(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + palignr $6, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $6, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $6, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl6LoopStart) + +L(Shl6LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_6_no_prefetch): + lea -32(%ecx), %ecx + lea -6(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_6_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_6_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_6_no_prefetch_loop) + +L(sh_6_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 6(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_7): +# ifndef USE_AS_MEMMOVE + movaps -7(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -7(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_7_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl7LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + movaps 41(%eax), %xmm4 + movaps 57(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + palignr $7, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $7, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $7, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl7LoopStart) + +L(Shl7LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_7_no_prefetch): + lea -32(%ecx), %ecx + lea -7(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_7_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_7_end_no_prefetch_loop) 
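/* The prefetcht0 pairs in the main loops fetch the source and
   destination lines 0x1c0 (448) bytes ahead of the current position
   into all cache levels, so the memory system runs ahead of the
   64-bytes-per-iteration copy.  Roughly, in C -- 448 is simply the
   distance used in this file, the best value is machine dependent,
   and copy64_aligned is an illustrative helper:

	#include <stddef.h>

	extern void copy64_aligned (char *, const char *);

	static void prefetched_copy (char *d, const char *s, size_t len)
	{
	  for (size_t i = 0; i + 64 <= len; i += 64)
	    {
	      __builtin_prefetch (s + i + 448, 0, 3);	/* GCC builtin: prefetcht0 */
	      __builtin_prefetch (d + i + 448, 0, 3);	/* the asm prefetches the
							   store side the same way */
	      copy64_aligned (d + i, s + i);
	    }
	}
*/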
+ + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_7_no_prefetch_loop) + +L(sh_7_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 7(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_8): +# ifndef USE_AS_MEMMOVE + movaps -8(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -8(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_8_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl8LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + movaps 40(%eax), %xmm4 + movaps 56(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + palignr $8, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $8, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $8, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl8LoopStart) + +L(LoopLeave8): + add $32, %ecx + jle L(shl_end_0) + + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_8_no_prefetch): + lea -32(%ecx), %ecx + lea -8(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_8_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_8_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_8_no_prefetch_loop) + +L(sh_8_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 8(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_9): +# ifndef USE_AS_MEMMOVE + movaps -9(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -9(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_9_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl9LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + movaps 39(%eax), %xmm4 + movaps 55(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + palignr $9, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $9, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $9, %xmm1, %xmm2 + movaps %xmm4, 
32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl9LoopStart) + +L(Shl9LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_9_no_prefetch): + lea -32(%ecx), %ecx + lea -9(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_9_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_9_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_9_no_prefetch_loop) + +L(sh_9_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 9(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_10): +# ifndef USE_AS_MEMMOVE + movaps -10(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -10(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_10_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl10LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + movaps 38(%eax), %xmm4 + movaps 54(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + palignr $10, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $10, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $10, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl10LoopStart) + +L(Shl10LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_10_no_prefetch): + lea -32(%ecx), %ecx + lea -10(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_10_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_10_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_10_no_prefetch_loop) + +L(sh_10_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 10(%edi, %eax), %eax + POP 
(%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_11): +# ifndef USE_AS_MEMMOVE + movaps -11(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -11(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_11_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl11LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + movaps 37(%eax), %xmm4 + movaps 53(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + palignr $11, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $11, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $11, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl11LoopStart) + +L(Shl11LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_11_no_prefetch): + lea -32(%ecx), %ecx + lea -11(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_11_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_11_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_11_no_prefetch_loop) + +L(sh_11_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 11(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_12): +# ifndef USE_AS_MEMMOVE + movaps -12(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -12(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_12_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl12LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + movaps 36(%eax), %xmm4 + movaps 52(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + palignr $12, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $12, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $12, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl12LoopStart) + +L(Shl12LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + 
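/* The *_no_prefetch loops in these blocks are unrolled into two
   32-byte halves that hand the trailing 16-byte chunk to the next
   half in alternating registers (%xmm1 into the first half, %xmm4
   into the second), since PALIGNR overwrites its destination and the
   carried chunk must be saved before it is consumed.  A hedged sketch
   of the shape (N and the function name are illustrative; len is
   pre-biased by -32 as in the assembly):

	#include <tmmintrin.h>

	#define N 11		/* e.g. the L(shl_11) case */

	static void no_prefetch_sketch (char *q, const char *p, long len)
	{
	  __m128i carry = _mm_load_si128 ((const __m128i *) p);	/* %xmm1 */
	  for (;;)
	    {
	      /* First half: carry arrives in %xmm1, is saved in %xmm4.  */
	      __m128i a = _mm_load_si128 ((const __m128i *) (p + 16));
	      __m128i b = _mm_load_si128 ((const __m128i *) (p + 32));
	      _mm_store_si128 ((__m128i *) q, _mm_alignr_epi8 (a, carry, N));
	      _mm_store_si128 ((__m128i *) (q + 16), _mm_alignr_epi8 (b, a, N));
	      carry = b;
	      p += 32; q += 32; len -= 32;
	      if (len < 0) break;
	      /* Second half: identical, with the carry roles swapped.  */
	      a = _mm_load_si128 ((const __m128i *) (p + 16));
	      b = _mm_load_si128 ((const __m128i *) (p + 32));
	      _mm_store_si128 ((__m128i *) q, _mm_alignr_epi8 (a, carry, N));
	      _mm_store_si128 ((__m128i *) (q + 16), _mm_alignr_epi8 (b, a, N));
	      carry = b;
	      p += 32; q += 32; len -= 32;
	      if (len < 0) break;
	    }
	}
*/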
lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_12_no_prefetch): + lea -32(%ecx), %ecx + lea -12(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_12_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_12_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_12_no_prefetch_loop) + +L(sh_12_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 12(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_13): +# ifndef USE_AS_MEMMOVE + movaps -13(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -13(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_13_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl13LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + movaps 35(%eax), %xmm4 + movaps 51(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + palignr $13, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $13, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $13, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl13LoopStart) + +L(Shl13LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_13_no_prefetch): + lea -32(%ecx), %ecx + lea -13(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_13_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_13_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_13_no_prefetch_loop) + +L(sh_13_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 13(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_14): +# ifndef USE_AS_MEMMOVE + movaps -14(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -14(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + 
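/* In the memmove build, each L(shl_N) block begins by storing %xmm0
   -- the first 16 source bytes, loaded once at function entry before
   any store could clobber them -- to the original destination, which
   is reloaded from DEST+4(%esp) because the earlier PUSH of %edi
   shifted the stack arguments by 4 bytes.  The effect, sketched with
   intrinsics (head is an illustrative name):

	#include <emmintrin.h>

	static void memmove_head_sketch (char *dst, const char *src)
	{
	  __m128i head = _mm_loadu_si128 ((const __m128i *) src); /* at entry */
	  /* ... the forward copy may now clobber the first 16 source
	     bytes if the regions overlap ... */
	  _mm_storeu_si128 ((__m128i *) dst, head); /* still the original bytes */
	}
*/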
SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_14_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl14LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + movaps 34(%eax), %xmm4 + movaps 50(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + palignr $14, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $14, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $14, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl14LoopStart) + +L(Shl14LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_14_no_prefetch): + lea -32(%ecx), %ecx + lea -14(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_14_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_14_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_14_no_prefetch_loop) + +L(sh_14_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 14(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_15): +# ifndef USE_AS_MEMMOVE + movaps -15(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -15(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_15_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl15LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + movaps 33(%eax), %xmm4 + movaps 49(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + palignr $15, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $15, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $15, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl15LoopStart) + +L(Shl15LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_15_no_prefetch): + lea -32(%ecx), %ecx + lea -15(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_15_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 
32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_15_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_15_no_prefetch_loop) + +L(sh_15_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 15(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_end_0): + lea 32(%ecx), %ecx + lea (%edx, %ecx), %edx + lea (%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 +L(fwd_write_44bytes): + movq -44(%eax), %xmm0 + movq %xmm0, -44(%edx) +L(fwd_write_36bytes): + movq -36(%eax), %xmm0 + movq %xmm0, -36(%edx) +L(fwd_write_28bytes): + movq -28(%eax), %xmm0 + movq %xmm0, -28(%edx) +L(fwd_write_20bytes): + movq -20(%eax), %xmm0 + movq %xmm0, -20(%edx) +L(fwd_write_12bytes): + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) +L(fwd_write_4bytes): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_40bytes): + movq -40(%eax), %xmm0 + movq %xmm0, -40(%edx) +L(fwd_write_32bytes): + movq -32(%eax), %xmm0 + movq %xmm0, -32(%edx) +L(fwd_write_24bytes): + movq -24(%eax), %xmm0 + movq %xmm0, -24(%edx) +L(fwd_write_16bytes): + movq -16(%eax), %xmm0 + movq %xmm0, -16(%edx) +L(fwd_write_8bytes): + movq -8(%eax), %xmm0 + movq %xmm0, -8(%edx) +L(fwd_write_0bytes): +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_5bytes): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_45bytes): + movq -45(%eax), %xmm0 + movq %xmm0, -45(%edx) +L(fwd_write_37bytes): + movq -37(%eax), %xmm0 + movq %xmm0, -37(%edx) +L(fwd_write_29bytes): + movq -29(%eax), %xmm0 + movq %xmm0, -29(%edx) +L(fwd_write_21bytes): + movq -21(%eax), %xmm0 + movq %xmm0, -21(%edx) +L(fwd_write_13bytes): + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_41bytes): + movq -41(%eax), %xmm0 + movq %xmm0, -41(%edx) +L(fwd_write_33bytes): + movq -33(%eax), %xmm0 + movq %xmm0, -33(%edx) +L(fwd_write_25bytes): + movq -25(%eax), %xmm0 + movq %xmm0, -25(%edx) +L(fwd_write_17bytes): + movq -17(%eax), %xmm0 + movq %xmm0, -17(%edx) +L(fwd_write_9bytes): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) +L(fwd_write_1bytes): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_46bytes): + movq -46(%eax), %xmm0 + movq %xmm0, -46(%edx) +L(fwd_write_38bytes): + movq -38(%eax), %xmm0 + movq %xmm0, -38(%edx) +L(fwd_write_30bytes): + movq -30(%eax), %xmm0 + movq %xmm0, 
-30(%edx) +L(fwd_write_22bytes): + movq -22(%eax), %xmm0 + movq %xmm0, -22(%edx) +L(fwd_write_14bytes): + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) +L(fwd_write_6bytes): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_42bytes): + movq -42(%eax), %xmm0 + movq %xmm0, -42(%edx) +L(fwd_write_34bytes): + movq -34(%eax), %xmm0 + movq %xmm0, -34(%edx) +L(fwd_write_26bytes): + movq -26(%eax), %xmm0 + movq %xmm0, -26(%edx) +L(fwd_write_18bytes): + movq -18(%eax), %xmm0 + movq %xmm0, -18(%edx) +L(fwd_write_10bytes): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) +L(fwd_write_2bytes): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_47bytes): + movq -47(%eax), %xmm0 + movq %xmm0, -47(%edx) +L(fwd_write_39bytes): + movq -39(%eax), %xmm0 + movq %xmm0, -39(%edx) +L(fwd_write_31bytes): + movq -31(%eax), %xmm0 + movq %xmm0, -31(%edx) +L(fwd_write_23bytes): + movq -23(%eax), %xmm0 + movq %xmm0, -23(%edx) +L(fwd_write_15bytes): + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) +L(fwd_write_7bytes): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_43bytes): + movq -43(%eax), %xmm0 + movq %xmm0, -43(%edx) +L(fwd_write_35bytes): + movq -35(%eax), %xmm0 + movq %xmm0, -35(%edx) +L(fwd_write_27bytes): + movq -27(%eax), %xmm0 + movq %xmm0, -27(%edx) +L(fwd_write_19bytes): + movq -19(%eax), %xmm0 + movq %xmm0, -19(%edx) +L(fwd_write_11bytes): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) +L(fwd_write_3bytes): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_40bytes_align): + movdqa -40(%eax), %xmm0 + movdqa %xmm0, -40(%edx) +L(fwd_write_24bytes_align): + movdqa -24(%eax), %xmm0 + movdqa %xmm0, -24(%edx) +L(fwd_write_8bytes_align): + movq -8(%eax), %xmm0 + movq %xmm0, -8(%edx) +L(fwd_write_0bytes_align): +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_32bytes_align): + movdqa -32(%eax), %xmm0 + movdqa %xmm0, -32(%edx) +L(fwd_write_16bytes_align): + movdqa -16(%eax), %xmm0 + movdqa %xmm0, -16(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_5bytes_align): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_45bytes_align): + movdqa -45(%eax), %xmm0 + movdqa %xmm0, -45(%edx) +L(fwd_write_29bytes_align): + movdqa -29(%eax), %xmm0 + movdqa %xmm0, -29(%edx) +L(fwd_write_13bytes_align): + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + 
movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_37bytes_align): + movdqa -37(%eax), %xmm0 + movdqa %xmm0, -37(%edx) +L(fwd_write_21bytes_align): + movdqa -21(%eax), %xmm0 + movdqa %xmm0, -21(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_41bytes_align): + movdqa -41(%eax), %xmm0 + movdqa %xmm0, -41(%edx) +L(fwd_write_25bytes_align): + movdqa -25(%eax), %xmm0 + movdqa %xmm0, -25(%edx) +L(fwd_write_9bytes_align): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) +L(fwd_write_1bytes_align): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_33bytes_align): + movdqa -33(%eax), %xmm0 + movdqa %xmm0, -33(%edx) +L(fwd_write_17bytes_align): + movdqa -17(%eax), %xmm0 + movdqa %xmm0, -17(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_46bytes_align): + movdqa -46(%eax), %xmm0 + movdqa %xmm0, -46(%edx) +L(fwd_write_30bytes_align): + movdqa -30(%eax), %xmm0 + movdqa %xmm0, -30(%edx) +L(fwd_write_14bytes_align): + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) +L(fwd_write_6bytes_align): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_38bytes_align): + movdqa -38(%eax), %xmm0 + movdqa %xmm0, -38(%edx) +L(fwd_write_22bytes_align): + movdqa -22(%eax), %xmm0 + movdqa %xmm0, -22(%edx) + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_42bytes_align): + movdqa -42(%eax), %xmm0 + movdqa %xmm0, -42(%edx) +L(fwd_write_26bytes_align): + movdqa -26(%eax), %xmm0 + movdqa %xmm0, -26(%edx) +L(fwd_write_10bytes_align): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) +L(fwd_write_2bytes_align): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_34bytes_align): + movdqa -34(%eax), %xmm0 + movdqa %xmm0, -34(%edx) +L(fwd_write_18bytes_align): + movdqa -18(%eax), %xmm0 + movdqa %xmm0, -18(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_47bytes_align): + movdqa -47(%eax), %xmm0 + movdqa %xmm0, -47(%edx) +L(fwd_write_31bytes_align): + movdqa -31(%eax), %xmm0 + movdqa %xmm0, -31(%edx) +L(fwd_write_15bytes_align): + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) +L(fwd_write_7bytes_align): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_39bytes_align): + movdqa -39(%eax), %xmm0 + 
movdqa %xmm0, -39(%edx) +L(fwd_write_23bytes_align): + movdqa -23(%eax), %xmm0 + movdqa %xmm0, -23(%edx) + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_43bytes_align): + movdqa -43(%eax), %xmm0 + movdqa %xmm0, -43(%edx) +L(fwd_write_27bytes_align): + movdqa -27(%eax), %xmm0 + movdqa %xmm0, -27(%edx) +L(fwd_write_11bytes_align): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) +L(fwd_write_3bytes_align): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_35bytes_align): + movdqa -35(%eax), %xmm0 + movdqa %xmm0, -35(%edx) +L(fwd_write_19bytes_align): + movdqa -19(%eax), %xmm0 + movdqa %xmm0, -19(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_44bytes_align): + movdqa -44(%eax), %xmm0 + movdqa %xmm0, -44(%edx) +L(fwd_write_28bytes_align): + movdqa -28(%eax), %xmm0 + movdqa %xmm0, -28(%edx) +L(fwd_write_12bytes_align): + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) +L(fwd_write_4bytes_align): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_36bytes_align): + movdqa -36(%eax), %xmm0 + movdqa %xmm0, -36(%edx) +L(fwd_write_20bytes_align): + movdqa -20(%eax), %xmm0 + movdqa %xmm0, -20(%edx) + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN_END + + CFI_PUSH (%edi) + + .p2align 4 +L(large_page): + movdqu (%eax), %xmm1 +# ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +# endif + lea 16(%eax), %eax + movntdq %xmm1, (%edx) + lea 16(%edx), %edx + lea -0x90(%ecx), %ecx + POP (%edi) + + .p2align 4 +L(large_page_loop): + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + movdqu 0x40(%eax), %xmm4 + movdqu 0x50(%eax), %xmm5 + movdqu 0x60(%eax), %xmm6 + movdqu 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + movntdq %xmm4, 0x40(%edx) + movntdq %xmm5, 0x50(%edx) + movntdq %xmm6, 0x60(%edx) + movntdq %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + jae L(large_page_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(large_page_less_64bytes) + + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + lea 0x40(%eax), %eax + + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + lea 0x40(%edx), %edx + sub $0x40, %ecx +L(large_page_less_64bytes): + cmp $32, %ecx + jb L(large_page_less_32bytes) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + lea 0x20(%eax), %eax + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + lea 0x20(%edx), %edx + sub $0x20, %ecx +L(large_page_less_32bytes): + add %ecx, %edx + add %ecx, %eax + sfence + 
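/* The L(large_page) path that ends here streams with movntdq:
   non-temporal stores bypass the caches, so a copy far larger than
   the cache does not evict its entire contents, and the closing
   sfence orders those weakly-ordered stores before the function
   returns.  A minimal intrinsics sketch, assuming a 16-byte-aligned
   destination (the function name is illustrative):

	#include <stddef.h>
	#include <emmintrin.h>

	static void stream_copy (char *d, const char *s, size_t len)
	{
	  for (; len >= 16; len -= 16, s += 16, d += 16)
	    _mm_stream_si128 ((__m128i *) d,
			      _mm_loadu_si128 ((const __m128i *) s));
	  _mm_sfence ();	/* order the non-temporal stores */
	}
*/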
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+
+	.p2align 4
+L(bk_write_44bytes):
+	movq	36(%eax), %xmm0
+	movq	%xmm0, 36(%edx)
+L(bk_write_36bytes):
+	movq	28(%eax), %xmm0
+	movq	%xmm0, 28(%edx)
+L(bk_write_28bytes):
+	movq	20(%eax), %xmm0
+	movq	%xmm0, 20(%edx)
+L(bk_write_20bytes):
+	movq	12(%eax), %xmm0
+	movq	%xmm0, 12(%edx)
+L(bk_write_12bytes):
+	movq	4(%eax), %xmm0
+	movq	%xmm0, 4(%edx)
+L(bk_write_4bytes):
+	movl	(%eax), %ecx
+	movl	%ecx, (%edx)
+L(bk_write_0bytes):
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_40bytes):
+	movq	32(%eax), %xmm0
+	movq	%xmm0, 32(%edx)
+L(bk_write_32bytes):
+	movq	24(%eax), %xmm0
+	movq	%xmm0, 24(%edx)
+L(bk_write_24bytes):
+	movq	16(%eax), %xmm0
+	movq	%xmm0, 16(%edx)
+L(bk_write_16bytes):
+	movq	8(%eax), %xmm0
+	movq	%xmm0, 8(%edx)
+L(bk_write_8bytes):
+	movq	(%eax), %xmm0
+	movq	%xmm0, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_45bytes):
+	movq	37(%eax), %xmm0
+	movq	%xmm0, 37(%edx)
+L(bk_write_37bytes):
+	movq	29(%eax), %xmm0
+	movq	%xmm0, 29(%edx)
+L(bk_write_29bytes):
+	movq	21(%eax), %xmm0
+	movq	%xmm0, 21(%edx)
+L(bk_write_21bytes):
+	movq	13(%eax), %xmm0
+	movq	%xmm0, 13(%edx)
+L(bk_write_13bytes):
+	movq	5(%eax), %xmm0
+	movq	%xmm0, 5(%edx)
+L(bk_write_5bytes):
+	movl	1(%eax), %ecx
+	movl	%ecx, 1(%edx)
+L(bk_write_1bytes):
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_41bytes):
+	movq	33(%eax), %xmm0
+	movq	%xmm0, 33(%edx)
+L(bk_write_33bytes):
+	movq	25(%eax), %xmm0
+	movq	%xmm0, 25(%edx)
+L(bk_write_25bytes):
+	movq	17(%eax), %xmm0
+	movq	%xmm0, 17(%edx)
+L(bk_write_17bytes):
+	movq	9(%eax), %xmm0
+	movq	%xmm0, 9(%edx)
+L(bk_write_9bytes):
+	movq	1(%eax), %xmm0
+	movq	%xmm0, 1(%edx)
+	movzbl	(%eax), %ecx
+	movb	%cl, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_46bytes):
+	movq	38(%eax), %xmm0
+	movq	%xmm0, 38(%edx)
+L(bk_write_38bytes):
+	movq	30(%eax), %xmm0
+	movq	%xmm0, 30(%edx)
+L(bk_write_30bytes):
+	movq	22(%eax), %xmm0
+	movq	%xmm0, 22(%edx)
+L(bk_write_22bytes):
+	movq	14(%eax), %xmm0
+	movq	%xmm0, 14(%edx)
+L(bk_write_14bytes):
+	movq	6(%eax), %xmm0
+	movq	%xmm0, 6(%edx)
+L(bk_write_6bytes):
+	movl	2(%eax), %ecx
+	movl	%ecx, 2(%edx)
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_42bytes):
+	movq	34(%eax), %xmm0
+	movq	%xmm0, 34(%edx)
+L(bk_write_34bytes):
+	movq	26(%eax), %xmm0
+	movq	%xmm0, 26(%edx)
+L(bk_write_26bytes):
+	movq	18(%eax), %xmm0
+	movq	%xmm0, 18(%edx)
+L(bk_write_18bytes):
+	movq	10(%eax), %xmm0
+	movq	%xmm0, 10(%edx)
+L(bk_write_10bytes):
+	movq	2(%eax), %xmm0
+	movq	%xmm0, 2(%edx)
+L(bk_write_2bytes):
+	movzwl	(%eax), %ecx
+	movw	%cx, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_47bytes):
+	movq	39(%eax), %xmm0
+	movq	%xmm0, 39(%edx)
+L(bk_write_39bytes):
+	movq	31(%eax), %xmm0
+	movq	%xmm0, 31(%edx)
+L(bk_write_31bytes):
+	movq	23(%eax), %xmm0
+	movq	%xmm0, 23(%edx)
+L(bk_write_23bytes):
+	movq	15(%eax), %xmm0
+	movq	%xmm0, 15(%edx)
+L(bk_write_15bytes):
+	movq	7(%eax), %xmm0
+	movq	%xmm0, 7(%edx)
+L(bk_write_7bytes):
+	movl	3(%eax), %ecx
+	movl	%ecx, 3(%edx)
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+# endif
+	RETURN
+
+	.p2align 4
+L(bk_write_43bytes):
+	movq	35(%eax), %xmm0
+	movq	%xmm0, 35(%edx)
+L(bk_write_35bytes):
+	movq	27(%eax), %xmm0
+	movq	%xmm0, 27(%edx)
+L(bk_write_27bytes):
+	movq	19(%eax), %xmm0
+	movq	%xmm0, 19(%edx)
+L(bk_write_19bytes):
+	movq	11(%eax), %xmm0
+	movq	%xmm0, 11(%edx)
+L(bk_write_11bytes):
+	movq	3(%eax), %xmm0
+	movq	%xmm0, 3(%edx)
+L(bk_write_3bytes):
+	movzwl	1(%eax), %ecx
+	movw	%cx, 1(%edx)
+	movzbl	(%eax), %eax
+	movb	%al, (%edx)
+# ifndef USE_AS_BCOPY
+	movl	DEST(%esp), %eax
+# ifdef USE_AS_MEMPCPY
+	movl	LEN(%esp), %ecx
+	add	%ecx, %eax
+# endif
+# endif
+	RETURN_END
+
+
+	.pushsection .rodata.ssse3,"a",@progbits
+	.p2align 2
+L(table_48bytes_fwd):
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
+
+	.p2align 2
+L(table_48bytes_fwd_align):
+	.int	JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
+	.int	JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
+
+	.p2align 2
+L(shl_table):
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	.p2align 2
+L(table_48_bytes_bwd):
+	.int	JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
+	.int	JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
+
+	.popsection
+
+# ifdef USE_AS_MEMMOVE
+	.p2align 4
+L(copy_backward):
+	PUSH (%edi)
+	movl	%eax, %edi
+	lea	(%ecx,%edx,1),%edx
+	lea	(%ecx,%edi,1),%edi
+	testl	$0x3, %edx
+	jnz	L(bk_align)
+
+L(bk_aligned_4):
+	cmp	$64, %ecx
+	jae	L(bk_write_more64bytes)
+
+L(bk_write_64bytesless):
+	cmp	$32, %ecx
+	jb	L(bk_write_less32bytes)
+
+L(bk_write_more32bytes):
+	/* Copy 32 bytes at a time.  */
+	sub	$32, %ecx
+	movq	-8(%edi), %xmm0
+	movq	%xmm0, -8(%edx)
+	movq	-16(%edi), %xmm0
+	movq	%xmm0, -16(%edx)
+	movq	-24(%edi), %xmm0
+	movq	%xmm0, -24(%edx)
+	movq	-32(%edi), %xmm0
+	movq	%xmm0, -32(%edx)
+	sub	$32, %edx
+	sub	$32, %edi
+
+L(bk_write_less32bytes):
+	movl	%edi, %eax
+	sub	%ecx, %edx
+	sub	%ecx, %eax
+	POP (%edi)
+L(bk_write_less32bytes_2):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(bk_align):
+	cmp	$8, %ecx
+	jbe	L(bk_write_less32bytes)
+	testl	$1, %edx
+	/* We get here only if (EDX & 3) != 0, so if (EDX & 1) == 0
+	   then (EDX & 2) must be != 0.  */
+	jz	L(bk_got2)
+	sub	$1, %edi
+	sub	$1, %ecx
+	sub	$1, %edx
+	movzbl	(%edi), %eax
+	movb	%al, (%edx)
+
+	testl	$2, %edx
+	jz	L(bk_aligned_4)
+
+L(bk_got2):
+	sub	$2, %edi
+	sub	$2, %ecx
+	sub	$2, %edx
+	movzwl	(%edi), %eax
+	movw	%ax, (%edx)
+	jmp	L(bk_aligned_4)
+
+	.p2align 4
+L(bk_write_more64bytes):
+	/* Check alignment of last byte.  */
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+/* EDX is aligned to 4 bytes, but not to 16 bytes.  */
+L(bk_ssse3_align):
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+	testl	$15, %edx
+	jz	L(bk_ssse3_cpy_pre)
+
+	sub	$4, %edi
+	sub	$4, %ecx
+	sub	$4, %edx
+	movl	(%edi), %eax
+	movl	%eax, (%edx)
+
+L(bk_ssse3_cpy_pre):
+	cmp	$64, %ecx
+	jb	L(bk_write_more32bytes)
+
+	.p2align 4
+L(bk_ssse3_cpy):
+	sub	$64, %edi
+	sub	$64, %ecx
+	sub	$64, %edx
+	movdqu	0x30(%edi), %xmm3
+	movdqa	%xmm3, 0x30(%edx)
+	movdqu	0x20(%edi), %xmm2
+	movdqa	%xmm2, 0x20(%edx)
+	movdqu	0x10(%edi), %xmm1
+	movdqa	%xmm1, 0x10(%edx)
+	movdqu	(%edi), %xmm0
+	movdqa	%xmm0, (%edx)
+	cmp	$64, %ecx
+	jae	L(bk_ssse3_cpy)
+	jmp	L(bk_write_64bytesless)
+
+# endif
+
+END (MEMCPY)
+
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
new file mode 100644
index 0000000000..f725944620
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S
@@ -0,0 +1,78 @@
+/* Multiple versions of memcpy
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need memcpy before the initialization + happened. */ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(memcpy) + .type memcpy, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memcpy_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_ssse3_rep) +2: ret +END(memcpy) + +# undef ENTRY +# define ENTRY(name) \ + .type __memcpy_ia32, @function; \ + .p2align 4; \ + .globl __memcpy_ia32; \ + .hidden __memcpy_ia32; \ + __memcpy_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __memcpy_chk_ia32, @function; \ + .globl __memcpy_chk_ia32; \ + .p2align 4; \ + __memcpy_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32 + +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memcpy; __GI_memcpy = __memcpy_ia32 +#endif + +#include "../memcpy.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S new file mode 100644 index 0000000000..1b4fbe2e6f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S @@ -0,0 +1,50 @@ +/* Multiple versions of __memcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch memcpy functions for static binaries. 
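   As a rough C model of the dispatch order encoded below (illustrative
   only: memcpy_chk_ifunc_model is a made-up name, and
   has_fast_unaligned_load/has_fast_rep_string are stand-ins for the
   glibc-internal Fast_Unaligned_Load and Fast_Rep_String bits, which
   have no public __builtin_cpu_supports equivalent):

     #include <stddef.h>
     typedef void *(*fn_t) (void *, const void *, size_t, size_t);
     extern int has_fast_unaligned_load (void);   // placeholder
     extern int has_fast_rep_string (void);       // placeholder
     extern void *__memcpy_chk_ia32 (void *, const void *, size_t, size_t);
     extern void *__memcpy_chk_sse2_unaligned (void *, const void *,
                                               size_t, size_t);
     extern void *__memcpy_chk_ssse3 (void *, const void *, size_t, size_t);
     extern void *__memcpy_chk_ssse3_rep (void *, const void *,
                                          size_t, size_t);

     static fn_t
     memcpy_chk_ifunc_model (void)
     {
       if (!__builtin_cpu_supports ("sse2"))
         return __memcpy_chk_ia32;
       if (has_fast_unaligned_load ())
         return __memcpy_chk_sse2_unaligned;
       if (!__builtin_cpu_supports ("ssse3"))
         return __memcpy_chk_sse2_unaligned;   // EAX still holds this one
       if (has_fast_rep_string ())
         return __memcpy_chk_ssse3_rep;
       return __memcpy_chk_ssse3;
     }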
+ */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__memcpy_chk) + .type __memcpy_chk, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_chk_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3_rep) +2: ret +END(__memcpy_chk) +# else +# include "../memcpy_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S new file mode 100644 index 0000000000..3873594cb2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_sse2_unaligned +#define MEMCPY_CHK __memmove_chk_sse2_unaligned +#include "memcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S new file mode 100644 index 0000000000..d202fc4a13 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3_rep +#define MEMCPY_CHK __memmove_chk_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S new file mode 100644 index 0000000000..295430b1ef --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3 +#define MEMCPY_CHK __memmove_chk_ssse3 +#include "memcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S new file mode 100644 index 0000000000..6eb418ca7f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S @@ -0,0 +1,89 @@ +/* Multiple versions of memmove + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. 
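   Every variant selected below must preserve memmove's overlap guarantee;
   a minimal C reference for that contract (memmove_model is a
   hypothetical name) is:

     #include <stddef.h>
     void *
     memmove_model (void *dst, const void *src, size_t n)
     {
       unsigned char *d = dst;
       const unsigned char *s = src;
       if (d <= s)
         while (n--)
           *d++ = *s++;   // forward copy never clobbers unread source bytes
       else
         while (n--)
           d[n] = s[n];   // backward copy handles dst > src overlap
       return dst;
     }

   The L(copy_backward) code earlier in this diff, pulled in by the
   memmove-*.S wrappers, is the vectorized form of the backward branch.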
*/ +#if IS_IN (libc) + .text +ENTRY(memmove) + .type memmove, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memmove_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_ssse3_rep) +2: ret +END(memmove) + +# ifdef SHARED +# undef ENTRY +# define ENTRY(name) \ + .type __memmove_ia32, @function; \ + .p2align 4; \ + .globl __memmove_ia32; \ + .hidden __memmove_ia32; \ + __memmove_ia32: cfi_startproc; \ + CALL_MCOUNT +# else +# undef ENTRY +# define ENTRY(name) \ + .type __memmove_ia32, @function; \ + .globl __memmove_ia32; \ + .p2align 4; \ + __memmove_ia32: cfi_startproc; \ + CALL_MCOUNT +# endif + +# undef END +# define END(name) \ + cfi_endproc; .size __memmove_ia32, .-__memmove_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __memmove_chk_ia32, @function; \ + .globl __memmove_chk_ia32; \ + .p2align 4; \ + __memmove_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memmove; __GI_memmove = __memmove_ia32 +# endif +#endif + +#include "../memmove.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S new file mode 100644 index 0000000000..314834c4c6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S @@ -0,0 +1,94 @@ +/* Multiple versions of __memmove_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. 
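   Each __memmove_chk_* thunk defined below performs the same
   _FORTIFY_SOURCE test before tail-calling the real implementation;
   roughly, in C (memmove_chk_model is a made-up name):

     #include <stddef.h>
     #include <string.h>
     extern void __chk_fail (void) __attribute__ ((noreturn));

     void *
     memmove_chk_model (void *dst, const void *src, size_t n, size_t dstlen)
     {
       if (dstlen < n)
         __chk_fail ();              // destination object too small: abort
       return memmove (dst, src, n);
     }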
*/ +#if IS_IN (libc) + .text +ENTRY(__memmove_chk) + .type __memmove_chk, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memmove_chk_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_chk_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3_rep) +2: ret +END(__memmove_chk) + +# ifndef SHARED + .type __memmove_chk_sse2_unaligned, @function + .p2align 4; +__memmove_chk_sse2_unaligned: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_sse2_unaligned + cfi_endproc + .size __memmove_chk_sse2_unaligned, .-__memmove_chk_sse2_unaligned + + .type __memmove_chk_ssse3, @function + .p2align 4; +__memmove_chk_ssse3: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_ssse3 + cfi_endproc + .size __memmove_chk_ssse3, .-__memmove_chk_ssse3 + + .type __memmove_chk_ssse3_rep, @function + .p2align 4; +__memmove_chk_ssse3_rep: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_ssse3_rep + cfi_endproc + .size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep + + .type __memmove_chk_ia32, @function + .p2align 4; +__memmove_chk_ia32: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_ia32 + cfi_endproc + .size __memmove_chk_ia32, .-__memmove_chk_ia32 +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S new file mode 100644 index 0000000000..a1cea50771 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_sse2_unaligned +#define MEMCPY_CHK __mempcpy_chk_sse2_unaligned +#include "memcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S new file mode 100644 index 0000000000..5357b33e18 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_ssse3_rep +#define MEMCPY_CHK __mempcpy_chk_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S new file mode 100644 index 0000000000..822d98e954 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_ssse3 +#define MEMCPY_CHK __mempcpy_chk_ssse3 +#include "memcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S new file mode 100644 index 0000000000..06e377fbc9 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S @@ -0,0 +1,81 @@ +/* Multiple versions of mempcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need mempcpy before the initialization + happened. */ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(__mempcpy) + .type __mempcpy, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__mempcpy_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_ssse3_rep) +2: ret +END(__mempcpy) + +# undef ENTRY +# define ENTRY(name) \ + .type __mempcpy_ia32, @function; \ + .p2align 4; \ + .globl __mempcpy_ia32; \ + .hidden __mempcpy_ia32; \ + __mempcpy_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __mempcpy_chk_ia32, @function; \ + .globl __mempcpy_chk_ia32; \ + .p2align 4; \ + __mempcpy_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32 + +# undef libc_hidden_def +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_def(name) \ + .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32 +# define libc_hidden_builtin_def(name) \ + .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32 +#endif + +#include "../mempcpy.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S new file mode 100644 index 0000000000..e13e5248a5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S @@ -0,0 +1,50 @@ +/* Multiple versions of __mempcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
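   Behind the __mempcpy names here, the only difference from memcpy is the
   return value; a one-line C reference (mempcpy_model is a hypothetical
   name):

     #include <string.h>
     void *
     mempcpy_model (void *dst, const void *src, size_t n)
     {
       memcpy (dst, src, n);
       return (unsigned char *) dst + n;   // points just past the copy
     }

   In the shared assembly this is the USE_AS_MEMPCPY case seen earlier:
   after the copy, LEN is reloaded and added to the EAX return value.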
*/ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch mempcpy functions for static binaries. + */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__mempcpy_chk) + .type __mempcpy_chk, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_chk_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3_rep) +2: ret +END(__mempcpy_chk) +# else +# include "../mempcpy_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c new file mode 100644 index 0000000000..ef7bbbe792 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c @@ -0,0 +1,7 @@ +#if IS_IN (libc) +# define MEMRCHR __memrchr_ia32 +# include <string.h> +extern void *__memrchr_ia32 (const void *, int, size_t); +#endif + +#include "string/memrchr.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S new file mode 100644 index 0000000000..dbbe94fd08 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S @@ -0,0 +1,417 @@ +/* Optimized memrchr with sse2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +# define MEMCHR __memrchr_sse2_bsf + + .text +ENTRY (MEMCHR) + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + mov LEN(%esp), %edx + + sub $16, %edx + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + add %edx, %ecx + punpcklbw %xmm1, %xmm1 + + movdqu (%ecx), %xmm0 + pshufd $0, %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %ecx + mov %ecx, %eax + and $15, %eax + jz L(loop_prolog) + + add $16, %ecx + add $16, %edx + sub %eax, %ecx + sub %eax, %edx + + .p2align 4 +/* Loop start on aligned string. 
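   In intrinsics terms, each 16-byte probe in this loop is roughly the
   following (illustrative sketch; last_match_in_16 is a made-up helper):

     #include <emmintrin.h>
     static int
     last_match_in_16 (const unsigned char *p, unsigned char c)
     {
       __m128i needle = _mm_set1_epi8 ((char) c);
       __m128i chunk = _mm_load_si128 ((const __m128i *) p);   // movdqa
       int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, needle));
       if (mask == 0)
         return -1;                                  // no match here
       return 31 - __builtin_clz ((unsigned) mask);  // what bsr computes
     }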
*/ +L(loop_prolog): + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm4 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches0) + + mov %ecx, %eax + and $63, %eax + test %eax, %eax + jz L(align64_loop) + + add $64, %ecx + add $64, %edx + sub %eax, %ecx + sub %eax, %edx + + .p2align 4 +L(align64_loop): + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa (%ecx), %xmm0 + movdqa 16(%ecx), %xmm2 + movdqa 32(%ecx), %xmm3 + movdqa 48(%ecx), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm3, %xmm0 + pmaxub %xmm4, %xmm2 + pmaxub %xmm0, %xmm2 + pmovmskb %xmm2, %eax + + test %eax, %eax + jz L(align64_loop) + + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches48) + + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm2 + + pcmpeqb %xmm1, %xmm2 + pcmpeqb (%ecx), %xmm1 + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + pmovmskb %xmm1, %eax + bsr %eax, %eax + + add %ecx, %eax + ret + + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb (%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches0_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 32(%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches32_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsr %eax, %eax + add %ecx, %eax + ret + + .p2align 4 +L(matches16): + bsr %eax, %eax + lea 16(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches32): + bsr %eax, %eax + lea 32(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches48): + bsr %eax, %eax + lea 48(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches0_1): + bsr %eax, %eax + sub $64, %edx + add %eax, %edx + jl L(return_null) + add %ecx, %eax + ret + + .p2align 4 +L(matches16_1): + bsr %eax, %eax + sub $48, %edx + add %eax, %edx + jl L(return_null) + lea 16(%ecx, %eax), %eax + ret + + .p2align 4 +L(matches32_1): + bsr %eax, %eax + sub $32, %edx + add %eax, %edx + jl L(return_null) + lea 32(%ecx, %eax), %eax + ret + + .p2align 4 +L(matches48_1): + bsr %eax, %eax + sub $16, %edx + add %eax, %edx + jl L(return_null) + lea 48(%ecx, %eax), %eax + 
ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + + .p2align 4 +L(length_less16_offset0): + mov %dl, %cl + pcmpeqb (%eax), %xmm1 + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + mov %edx, %ecx + + pmovmskb %xmm1, %edx + + and %ecx, %edx + test %edx, %edx + jz L(return_null) + + bsr %edx, %ecx + add %ecx, %eax + ret + + .p2align 4 +L(length_less16): + punpcklbw %xmm1, %xmm1 + mov %ecx, %eax + punpcklbw %xmm1, %xmm1 + add $16, %edx + jz L(return_null) + + pshufd $0, %xmm1, %xmm1 + and $15, %ecx + jz L(length_less16_offset0) + + PUSH (%edi) + mov %cl, %dh + add %dl, %dh + and $-16, %eax + + sub $16, %dh + ja L(length_less16_part2) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + sar %cl, %edi + add %ecx, %eax + mov %dl, %cl + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2): + movdqa 16(%eax), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %edi + + mov %cl, %ch + + mov %dh, %cl + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + + test %edi, %edi + jnz L(length_less16_part2_return) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + mov %ch, %cl + sar %cl, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + xor %ch, %ch + add %ecx, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2_return): + bsr %edi, %edi + lea 16(%eax, %edi), %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(ret_null): + xor %eax, %eax + POP (%edi) + ret + +END (MEMCHR) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S new file mode 100644 index 0000000000..5f7853f683 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S @@ -0,0 +1,724 @@ +/* Optimized memrchr with sse2 without bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
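   This is the variant chosen when the CPU has the Slow_BSF property (see
   memrchr.S below): instead of bsr it locates the highest set bit of the
   16-bit pcmpeqb mask with a branch ladder.  A C rendering of that ladder
   (highest_set_bit16 is a made-up name):

     static int
     highest_set_bit16 (unsigned mask)   // assumes 0 < mask < 0x10000
     {
       int base = 0;
       if (mask & 0xff00) { base = 8; mask >>= 8; } // the test %ah, %ah split
       if (mask & 0xf0) { base += 4; mask >>= 4; }  // the and $15 << 4 split
       if (mask & 0x8) return base + 3;
       if (mask & 0x4) return base + 2;
       if (mask & 0x2) return base + 1;
       return base;                                 // only bit 0 remains
     }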
*/ + +#if IS_IN (libc) + +# include <sysdep.h> +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + + atom_text_section +ENTRY (__memrchr_sse2) + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + mov LEN(%esp), %edx + + sub $16, %edx + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + add %edx, %ecx + punpcklbw %xmm1, %xmm1 + + movdqu (%ecx), %xmm0 + pshufd $0, %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(exit_dispatch) + + sub $64, %ecx + mov %ecx, %eax + and $15, %eax + jz L(loop_prolog) + + lea 16(%ecx), %ecx + lea 16(%edx), %edx + sub %eax, %edx + and $-16, %ecx + + .p2align 4 +/* Loop start on aligned string. */ +L(loop_prolog): + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm4 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(exit_dispatch) + + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(exit_dispatch) + + mov %ecx, %eax + and $63, %eax + test %eax, %eax + jz L(align64_loop) + + lea 64(%ecx), %ecx + lea 64(%edx), %edx + and $-64, %ecx + sub %eax, %edx + + .p2align 4 +L(align64_loop): + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa (%ecx), %xmm0 + movdqa 16(%ecx), %xmm2 + movdqa 32(%ecx), %xmm3 + movdqa 48(%ecx), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm3, %xmm0 + pmaxub %xmm4, %xmm2 + pmaxub %xmm0, %xmm2 + pmovmskb %xmm2, %eax + + test %eax, %eax + jz L(align64_loop) + + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches48) + + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm2 + + pcmpeqb %xmm1, %xmm2 + pcmpeqb (%ecx), %xmm1 + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + pmovmskb %xmm1, %eax + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb (%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 
L(matches0_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 32(%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches32_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches16): + lea 16(%ecx), %ecx + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(matches32): + lea 32(%ecx), %ecx + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(matches48): + lea 48(%ecx), %ecx + + .p2align 4 +L(exit_dispatch): + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(exit_dispatch_8): + test $0x80, %al + jnz L(exit_8) + test $0x40, %al + jnz L(exit_7) + test $0x20, %al + jnz L(exit_6) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_high): + mov %ah, %dh + and $15 << 4, %dh + jnz L(exit_dispatch_high_8) + test $0x08, %ah + jnz L(exit_12) + test $0x04, %ah + jnz L(exit_11) + test $0x02, %ah + jnz L(exit_10) + lea 8(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_high_8): + test $0x80, %ah + jnz L(exit_16) + test $0x40, %ah + jnz L(exit_15) + test $0x20, %ah + jnz L(exit_14) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(exit_2): + lea 1(%ecx), %eax + ret + + .p2align 4 +L(exit_3): + lea 2(%ecx), %eax + ret + + .p2align 4 +L(exit_4): + lea 3(%ecx), %eax + ret + + .p2align 4 +L(exit_6): + lea 5(%ecx), %eax + ret + + .p2align 4 +L(exit_7): + lea 6(%ecx), %eax + ret + + .p2align 4 +L(exit_8): + lea 7(%ecx), %eax + ret + + .p2align 4 +L(exit_10): + lea 9(%ecx), %eax + ret + + .p2align 4 +L(exit_11): + lea 10(%ecx), %eax + ret + + .p2align 4 +L(exit_12): + lea 11(%ecx), %eax + ret + + .p2align 4 +L(exit_14): + lea 13(%ecx), %eax + ret + + .p2align 4 +L(exit_15): + lea 14(%ecx), %eax + ret + + .p2align 4 +L(exit_16): + lea 15(%ecx), %eax + ret + + .p2align 4 +L(matches0_1): + lea -64(%edx), %edx + + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(matches16_1): + lea -48(%edx), %edx + lea 16(%ecx), %ecx + + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(matches32_1): + lea -32(%edx), %edx + lea 32(%ecx), %ecx + + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(matches48_1): + lea -16(%edx), %edx + lea 48(%ecx), %ecx + + .p2align 4 
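/* Note: L(exit_dispatch_1) and the other _1 labels are the length-checked
   twin of L(exit_dispatch) above.  By this point %edx has been biased so
   that, for a match at byte index k of the current block, the sequence
   "add $k, %edx; jl L(return_null)" goes negative exactly when the match
   would fall before the first valid byte of the buffer; in rough C:
   if (bias + k < 0) return NULL; else return block + k.  */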
+L(exit_dispatch_1): + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(exit_dispatch_1_8): + test $0x80, %al + jnz L(exit_1_8) + test $0x40, %al + jnz L(exit_1_7) + test $0x20, %al + jnz L(exit_1_6) + add $4, %edx + jl L(return_null) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_1_high): + mov %ah, %al + and $15 << 4, %al + jnz L(exit_dispatch_1_high_8) + test $0x08, %ah + jnz L(exit_1_12) + test $0x04, %ah + jnz L(exit_1_11) + test $0x02, %ah + jnz L(exit_1_10) + add $8, %edx + jl L(return_null) + lea 8(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_1_high_8): + test $0x80, %ah + jnz L(exit_1_16) + test $0x40, %ah + jnz L(exit_1_15) + test $0x20, %ah + jnz L(exit_1_14) + add $12, %edx + jl L(return_null) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(exit_1_2): + add $1, %edx + jl L(return_null) + lea 1(%ecx), %eax + ret + + .p2align 4 +L(exit_1_3): + add $2, %edx + jl L(return_null) + lea 2(%ecx), %eax + ret + + .p2align 4 +L(exit_1_4): + add $3, %edx + jl L(return_null) + lea 3(%ecx), %eax + ret + + .p2align 4 +L(exit_1_6): + add $5, %edx + jl L(return_null) + lea 5(%ecx), %eax + ret + + .p2align 4 +L(exit_1_7): + add $6, %edx + jl L(return_null) + lea 6(%ecx), %eax + ret + + .p2align 4 +L(exit_1_8): + add $7, %edx + jl L(return_null) + lea 7(%ecx), %eax + ret + + .p2align 4 +L(exit_1_10): + add $9, %edx + jl L(return_null) + lea 9(%ecx), %eax + ret + + .p2align 4 +L(exit_1_11): + add $10, %edx + jl L(return_null) + lea 10(%ecx), %eax + ret + + .p2align 4 +L(exit_1_12): + add $11, %edx + jl L(return_null) + lea 11(%ecx), %eax + ret + + .p2align 4 +L(exit_1_14): + add $13, %edx + jl L(return_null) + lea 13(%ecx), %eax + ret + + .p2align 4 +L(exit_1_15): + add $14, %edx + jl L(return_null) + lea 14(%ecx), %eax + ret + + .p2align 4 +L(exit_1_16): + add $15, %edx + jl L(return_null) + lea 15(%ecx), %eax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + + .p2align 4 +L(length_less16_offset0): + mov %dl, %cl + pcmpeqb (%eax), %xmm1 + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + mov %eax, %ecx + pmovmskb %xmm1, %eax + + and %edx, %eax + test %eax, %eax + jnz L(exit_dispatch) + + xor %eax, %eax + ret + + .p2align 4 +L(length_less16): + punpcklbw %xmm1, %xmm1 + add $16, %edx + je L(return_null) + punpcklbw %xmm1, %xmm1 + + mov %ecx, %eax + pshufd $0, %xmm1, %xmm1 + + and $15, %ecx + jz L(length_less16_offset0) + + PUSH (%edi) + + mov %cl, %dh + add %dl, %dh + and $-16, %eax + + sub $16, %dh + ja L(length_less16_part2) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + sar %cl, %edi + add %ecx, %eax + mov %dl, %cl + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2): + movdqa 16(%eax), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %edi + + mov %cl, %ch + + mov %dh, %cl + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + + test %edi, %edi + jnz L(length_less16_part2_return) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + mov %ch, %cl + sar %cl, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + xor %ch, %ch + add %ecx, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2_return): + bsr %edi, 
%edi + lea 16(%eax, %edi), %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(ret_null): + xor %eax, %eax + POP (%edi) + ret + +END (__memrchr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S new file mode 100644 index 0000000000..d4253a553b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S @@ -0,0 +1,45 @@ +/* Multiple versions of memrchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__memrchr) + .type __memrchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + HAS_CPU_FEATURE (SSE2) + jz 2f + HAS_ARCH_FEATURE (Slow_BSF) + jz 3f + + LOAD_FUNC_GOT_EAX (__memrchr_sse2) + ret + +2: LOAD_FUNC_GOT_EAX (__memrchr_ia32) + ret + +3: LOAD_FUNC_GOT_EAX (__memrchr_sse2_bsf) + ret +END(__memrchr) + +weak_alias(__memrchr, memrchr) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S new file mode 100644 index 0000000000..3221077e49 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S @@ -0,0 +1,811 @@ +/* memset with SSE2 and REP string. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
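   The JMPTBL/BRANCH_TO_JMPTBL_ENTRY machinery defined below stores each
   table entry as an offset relative to the table itself, so PIC builds
   need no load-time relocations for it.  The GNU C "labels as values"
   extension expresses the same trick (toy sketch only; dispatch3 is a
   made-up name):

     static void
     dispatch3 (int i)
     {
       static const int table[] =        // offsets, not absolute addresses
         { &&t0 - &&t0, &&t1 - &&t0, &&t2 - &&t0 };
       goto *(&&t0 + table[i]);          // base + relative entry
     t0: return;
     t1: return;
     t2: return;
     }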
*/ + +#if IS_IN (libc) + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_BZERO +# define DEST PARMS +# define LEN DEST+4 +# define SETRTNVAL +#else +# define DEST PARMS +# define CHR DEST+4 +# define LEN CHR+4 +# define SETRTNVAL movl DEST(%esp), %eax +#endif + +#ifdef SHARED +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define PARMS 8 /* Preserve EBX. */ +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + add $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + add (%ebx,%ecx,4), %ebx; \ + add %ecx, %edx; \ + /* We loaded the jump table and adjusted EDX. Go. */ \ + jmp *%ebx +#else +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define PARMS 4 +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + add %ecx, %edx; \ + jmp *TABLE(,%ecx,4) +#endif + + .section .text.sse2,"ax",@progbits +#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO +ENTRY (__memset_chk_sse2_rep) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk_sse2_rep) +#endif +ENTRY (__memset_sse2_rep) + ENTRANCE + + movl LEN(%esp), %ecx +#ifdef USE_AS_BZERO + xor %eax, %eax +#else + movzbl CHR(%esp), %eax + movb %al, %ah + /* Fill the whole EAX with pattern. 
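   The movzbl/movb pair above puts the byte in AL and AH; the three
   instructions after this comment widen it to all four lanes.  In C
   terms, roughly:
     p = (unsigned char) c;   // movzbl
     p |= p << 8;             // movb %al, %ah
     p |= p << 16;            // movl, shl, or
   leaving p == c * 0x01010101u, one copy of c per byte.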
*/ + movl %eax, %edx + shl $16, %eax + or %edx, %eax +#endif + movl DEST(%esp), %edx + cmp $32, %ecx + jae L(32bytesormore) + +L(write_less32bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes)) + + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_less_32bytes): + .int JMPTBL (L(write_0bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_7bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_32bytes)) + .popsection + + ALIGN (4) +L(write_28bytes): + movl %eax, -28(%edx) +L(write_24bytes): + movl %eax, -24(%edx) +L(write_20bytes): + movl %eax, -20(%edx) +L(write_16bytes): + movl %eax, -16(%edx) +L(write_12bytes): + movl %eax, -12(%edx) +L(write_8bytes): + movl %eax, -8(%edx) +L(write_4bytes): + movl %eax, -4(%edx) +L(write_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(write_29bytes): + movl %eax, -29(%edx) +L(write_25bytes): + movl %eax, -25(%edx) +L(write_21bytes): + movl %eax, -21(%edx) +L(write_17bytes): + movl %eax, -17(%edx) +L(write_13bytes): + movl %eax, -13(%edx) +L(write_9bytes): + movl %eax, -9(%edx) +L(write_5bytes): + movl %eax, -5(%edx) +L(write_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_30bytes): + movl %eax, -30(%edx) +L(write_26bytes): + movl %eax, -26(%edx) +L(write_22bytes): + movl %eax, -22(%edx) +L(write_18bytes): + movl %eax, -18(%edx) +L(write_14bytes): + movl %eax, -14(%edx) +L(write_10bytes): + movl %eax, -10(%edx) +L(write_6bytes): + movl %eax, -6(%edx) +L(write_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_31bytes): + movl %eax, -31(%edx) +L(write_27bytes): + movl %eax, -27(%edx) +L(write_23bytes): + movl %eax, -23(%edx) +L(write_19bytes): + movl %eax, -19(%edx) +L(write_15bytes): + movl %eax, -15(%edx) +L(write_11bytes): + movl %eax, -11(%edx) +L(write_7bytes): + movl %eax, -7(%edx) +L(write_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +/* 
ECX > 32 and EDX is 4 byte aligned. */ +L(32bytesormore): + /* Fill xmm0 with the pattern. */ +#ifdef USE_AS_BZERO + pxor %xmm0, %xmm0 +#else + movd %eax, %xmm0 + pshufd $0, %xmm0, %xmm0 +#endif + testl $0xf, %edx + jz L(aligned_16) +/* ECX > 32 and EDX is not 16 byte aligned. */ +L(not_aligned_16): + movdqu %xmm0, (%edx) + movl %edx, %eax + and $-16, %edx + add $16, %edx + sub %edx, %eax + add %eax, %ecx + movd %xmm0, %eax + + ALIGN (4) +L(aligned_16): + cmp $128, %ecx + jae L(128bytesormore) + +L(aligned_16_less128bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytesormore): + PUSH (%edi) +#ifdef DATA_CACHE_SIZE + PUSH (%ebx) + mov $DATA_CACHE_SIZE, %ebx +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov __x86_data_cache_size@GOTOFF(%ebx), %ebx +# else + PUSH (%ebx) + mov __x86_data_cache_size, %ebx +# endif +#endif + mov %ebx, %edi + shr $4, %ebx + sub %ebx, %edi +#if defined DATA_CACHE_SIZE || !defined SHARED + POP (%ebx) +#endif +/* + * When data size approximate the end of L1 cache, + * fast string will prefetch and combine data efficiently. + */ + cmp %edi, %ecx + jae L(128bytesormore_endof_L1) + subl $128, %ecx +L(128bytesormore_normal): + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jb L(128bytesless_normal) + + + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jae L(128bytesormore_normal) + +L(128bytesless_normal): + POP (%edi) + add $128, %ecx + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + CFI_PUSH (%edi) + ALIGN (4) +L(128bytesormore_endof_L1): + mov %edx, %edi + mov %ecx, %edx + shr $2, %ecx + and $3, %edx + rep stosl + jz L(copy_page_by_rep_exit) + cmp $2, %edx + jb L(copy_page_by_rep_left_1) + movw %ax, (%edi) + add $2, %edi + sub $2, %edx + jz L(copy_page_by_rep_exit) +L(copy_page_by_rep_left_1): + movb %al, (%edi) +L(copy_page_by_rep_exit): + POP (%edi) + SETRTNVAL + RETURN + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_16_128bytes): + .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_19bytes), 
L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_78bytes), 
L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes)) + .popsection + + ALIGN (4) +L(aligned_16_112bytes): + movdqa %xmm0, -112(%edx) +L(aligned_16_96bytes): + movdqa %xmm0, -96(%edx) +L(aligned_16_80bytes): + movdqa %xmm0, -80(%edx) +L(aligned_16_64bytes): + movdqa %xmm0, -64(%edx) +L(aligned_16_48bytes): + movdqa %xmm0, -48(%edx) +L(aligned_16_32bytes): + movdqa %xmm0, -32(%edx) +L(aligned_16_16bytes): + movdqa %xmm0, -16(%edx) +L(aligned_16_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_113bytes): + movdqa %xmm0, -113(%edx) +L(aligned_16_97bytes): + movdqa %xmm0, -97(%edx) +L(aligned_16_81bytes): 
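In the SHARED build, each entry of the 128-entry table above is stored by JMPTBL as the difference between a handler label and the table base, so the table contains no absolute addresses and needs no load-time relocations; BRANCH_TO_JMPTBL_ENTRY adds the base back at run time before the indirect jump (the non-PIC build simply stores absolute addresses). GNU C can express the same technique with label differences and computed goto; a minimal sketch, with illustrative names not taken from this commit:

    #include <stddef.h>

    int dispatch (size_t n)
    {
      /* Position-independent table of label differences, mirroring
         .int JMPTBL (entry, table) == entry - table.  */
      static const int table[] = {
        &&l0 - &&l0, &&l1 - &&l0, &&l2 - &&l0, &&l3 - &&l0
      };
      /* Base + offset, then one indirect jump -- the C analogue of
         add (%ebx,%ecx,4), %ebx; jmp *%ebx.  */
      goto *(&&l0 + table[n & 3]);
     l0: return 0;
     l1: return 10;
     l2: return 20;
     l3: return 30;
    }

Indexing by the residual byte count lets every tail length from 0 to 127 reach its handler in a single indirect jump, with no loop and no compare chain.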
+ movdqa %xmm0, -81(%edx) +L(aligned_16_65bytes): + movdqa %xmm0, -65(%edx) +L(aligned_16_49bytes): + movdqa %xmm0, -49(%edx) +L(aligned_16_33bytes): + movdqa %xmm0, -33(%edx) +L(aligned_16_17bytes): + movdqa %xmm0, -17(%edx) +L(aligned_16_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_114bytes): + movdqa %xmm0, -114(%edx) +L(aligned_16_98bytes): + movdqa %xmm0, -98(%edx) +L(aligned_16_82bytes): + movdqa %xmm0, -82(%edx) +L(aligned_16_66bytes): + movdqa %xmm0, -66(%edx) +L(aligned_16_50bytes): + movdqa %xmm0, -50(%edx) +L(aligned_16_34bytes): + movdqa %xmm0, -34(%edx) +L(aligned_16_18bytes): + movdqa %xmm0, -18(%edx) +L(aligned_16_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_115bytes): + movdqa %xmm0, -115(%edx) +L(aligned_16_99bytes): + movdqa %xmm0, -99(%edx) +L(aligned_16_83bytes): + movdqa %xmm0, -83(%edx) +L(aligned_16_67bytes): + movdqa %xmm0, -67(%edx) +L(aligned_16_51bytes): + movdqa %xmm0, -51(%edx) +L(aligned_16_35bytes): + movdqa %xmm0, -35(%edx) +L(aligned_16_19bytes): + movdqa %xmm0, -19(%edx) +L(aligned_16_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_116bytes): + movdqa %xmm0, -116(%edx) +L(aligned_16_100bytes): + movdqa %xmm0, -100(%edx) +L(aligned_16_84bytes): + movdqa %xmm0, -84(%edx) +L(aligned_16_68bytes): + movdqa %xmm0, -68(%edx) +L(aligned_16_52bytes): + movdqa %xmm0, -52(%edx) +L(aligned_16_36bytes): + movdqa %xmm0, -36(%edx) +L(aligned_16_20bytes): + movdqa %xmm0, -20(%edx) +L(aligned_16_4bytes): + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_117bytes): + movdqa %xmm0, -117(%edx) +L(aligned_16_101bytes): + movdqa %xmm0, -101(%edx) +L(aligned_16_85bytes): + movdqa %xmm0, -85(%edx) +L(aligned_16_69bytes): + movdqa %xmm0, -69(%edx) +L(aligned_16_53bytes): + movdqa %xmm0, -53(%edx) +L(aligned_16_37bytes): + movdqa %xmm0, -37(%edx) +L(aligned_16_21bytes): + movdqa %xmm0, -21(%edx) +L(aligned_16_5bytes): + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_118bytes): + movdqa %xmm0, -118(%edx) +L(aligned_16_102bytes): + movdqa %xmm0, -102(%edx) +L(aligned_16_86bytes): + movdqa %xmm0, -86(%edx) +L(aligned_16_70bytes): + movdqa %xmm0, -70(%edx) +L(aligned_16_54bytes): + movdqa %xmm0, -54(%edx) +L(aligned_16_38bytes): + movdqa %xmm0, -38(%edx) +L(aligned_16_22bytes): + movdqa %xmm0, -22(%edx) +L(aligned_16_6bytes): + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_119bytes): + movdqa %xmm0, -119(%edx) +L(aligned_16_103bytes): + movdqa %xmm0, -103(%edx) +L(aligned_16_87bytes): + movdqa %xmm0, -87(%edx) +L(aligned_16_71bytes): + movdqa %xmm0, -71(%edx) +L(aligned_16_55bytes): + movdqa %xmm0, -55(%edx) +L(aligned_16_39bytes): + movdqa %xmm0, -39(%edx) +L(aligned_16_23bytes): + movdqa %xmm0, -23(%edx) +L(aligned_16_7bytes): + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_120bytes): + movdqa %xmm0, -120(%edx) +L(aligned_16_104bytes): + movdqa %xmm0, -104(%edx) +L(aligned_16_88bytes): + movdqa %xmm0, -88(%edx) +L(aligned_16_72bytes): + movdqa %xmm0, -72(%edx) +L(aligned_16_56bytes): + movdqa %xmm0, -56(%edx) +L(aligned_16_40bytes): + movdqa %xmm0, -40(%edx) +L(aligned_16_24bytes): + movdqa %xmm0, -24(%edx) +L(aligned_16_8bytes): + movq %xmm0, -8(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_121bytes): + movdqa %xmm0, -121(%edx) +L(aligned_16_105bytes): + movdqa %xmm0, -105(%edx) 
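Each L(aligned_16_*bytes) entry in these blocks is a chain of fall-through movdqa stores addressed backward from EDX, which at this point holds one-past-the-end of the region, finished off by an 8/4/2/1-byte combination for the residue. Because every length has its own entry point, no counter or loop is needed, and every movdqa lands on a 16-byte boundary. A rough C rendering of one such path (a sketch, assuming dst is 16-byte aligned and n < 128; the helper name is hypothetical):

    #include <string.h>
    #include <stddef.h>

    static void set_aligned_tail (unsigned char *dst, int c, size_t n)
    {
      unsigned char pat[16];
      memset (pat, c, sizeof pat);      /* the pattern kept in xmm0        */
      unsigned char *end = dst + n;     /* EDX points here                 */
      for (size_t done = 0; done + 16 <= n; done += 16)
        memcpy (dst + done, pat, 16);   /* one movdqa per 16-byte chunk    */
      size_t rem = n % 16;              /* residue, written back from end  */
      if (rem >= 8) { memcpy (end - rem, pat, 8); rem -= 8; }   /* movq  */
      if (rem >= 4) { memcpy (end - rem, pat, 4); rem -= 4; }   /* movl  */
      if (rem >= 2) { memcpy (end - rem, pat, 2); rem -= 2; }   /* movw  */
      if (rem)      end[-1] = (unsigned char) c;                /* movb  */
    }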
+L(aligned_16_89bytes): + movdqa %xmm0, -89(%edx) +L(aligned_16_73bytes): + movdqa %xmm0, -73(%edx) +L(aligned_16_57bytes): + movdqa %xmm0, -57(%edx) +L(aligned_16_41bytes): + movdqa %xmm0, -41(%edx) +L(aligned_16_25bytes): + movdqa %xmm0, -25(%edx) +L(aligned_16_9bytes): + movq %xmm0, -9(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_122bytes): + movdqa %xmm0, -122(%edx) +L(aligned_16_106bytes): + movdqa %xmm0, -106(%edx) +L(aligned_16_90bytes): + movdqa %xmm0, -90(%edx) +L(aligned_16_74bytes): + movdqa %xmm0, -74(%edx) +L(aligned_16_58bytes): + movdqa %xmm0, -58(%edx) +L(aligned_16_42bytes): + movdqa %xmm0, -42(%edx) +L(aligned_16_26bytes): + movdqa %xmm0, -26(%edx) +L(aligned_16_10bytes): + movq %xmm0, -10(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_123bytes): + movdqa %xmm0, -123(%edx) +L(aligned_16_107bytes): + movdqa %xmm0, -107(%edx) +L(aligned_16_91bytes): + movdqa %xmm0, -91(%edx) +L(aligned_16_75bytes): + movdqa %xmm0, -75(%edx) +L(aligned_16_59bytes): + movdqa %xmm0, -59(%edx) +L(aligned_16_43bytes): + movdqa %xmm0, -43(%edx) +L(aligned_16_27bytes): + movdqa %xmm0, -27(%edx) +L(aligned_16_11bytes): + movq %xmm0, -11(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_124bytes): + movdqa %xmm0, -124(%edx) +L(aligned_16_108bytes): + movdqa %xmm0, -108(%edx) +L(aligned_16_92bytes): + movdqa %xmm0, -92(%edx) +L(aligned_16_76bytes): + movdqa %xmm0, -76(%edx) +L(aligned_16_60bytes): + movdqa %xmm0, -60(%edx) +L(aligned_16_44bytes): + movdqa %xmm0, -44(%edx) +L(aligned_16_28bytes): + movdqa %xmm0, -28(%edx) +L(aligned_16_12bytes): + movq %xmm0, -12(%edx) + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_125bytes): + movdqa %xmm0, -125(%edx) +L(aligned_16_109bytes): + movdqa %xmm0, -109(%edx) +L(aligned_16_93bytes): + movdqa %xmm0, -93(%edx) +L(aligned_16_77bytes): + movdqa %xmm0, -77(%edx) +L(aligned_16_61bytes): + movdqa %xmm0, -61(%edx) +L(aligned_16_45bytes): + movdqa %xmm0, -45(%edx) +L(aligned_16_29bytes): + movdqa %xmm0, -29(%edx) +L(aligned_16_13bytes): + movq %xmm0, -13(%edx) + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_126bytes): + movdqa %xmm0, -126(%edx) +L(aligned_16_110bytes): + movdqa %xmm0, -110(%edx) +L(aligned_16_94bytes): + movdqa %xmm0, -94(%edx) +L(aligned_16_78bytes): + movdqa %xmm0, -78(%edx) +L(aligned_16_62bytes): + movdqa %xmm0, -62(%edx) +L(aligned_16_46bytes): + movdqa %xmm0, -46(%edx) +L(aligned_16_30bytes): + movdqa %xmm0, -30(%edx) +L(aligned_16_14bytes): + movq %xmm0, -14(%edx) + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_127bytes): + movdqa %xmm0, -127(%edx) +L(aligned_16_111bytes): + movdqa %xmm0, -111(%edx) +L(aligned_16_95bytes): + movdqa %xmm0, -95(%edx) +L(aligned_16_79bytes): + movdqa %xmm0, -79(%edx) +L(aligned_16_63bytes): + movdqa %xmm0, -63(%edx) +L(aligned_16_47bytes): + movdqa %xmm0, -47(%edx) +L(aligned_16_31bytes): + movdqa %xmm0, -31(%edx) +L(aligned_16_15bytes): + movq %xmm0, -15(%edx) + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN_END + +END (__memset_sse2_rep) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S new file mode 100644 index 0000000000..d7b8be9114 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S @@ -0,0 +1,860 @@ +/* memset with SSE2 + Copyright (C) 2010-2017 Free 
Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_BZERO +# define DEST PARMS +# define LEN DEST+4 +# define SETRTNVAL +#else +# define DEST PARMS +# define CHR DEST+4 +# define LEN CHR+4 +# define SETRTNVAL movl DEST(%esp), %eax +#endif + +#ifdef SHARED +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define PARMS 8 /* Preserve EBX. */ +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + add $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + add (%ebx,%ecx,4), %ebx; \ + add %ecx, %edx; \ + /* We loaded the jump table and adjusted EDX. Go. */ \ + jmp *%ebx +#else +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define PARMS 4 +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + add %ecx, %edx; \ + jmp *TABLE(,%ecx,4) +#endif + + .section .text.sse2,"ax",@progbits +#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO +ENTRY (__memset_chk_sse2) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk_sse2) +#endif +ENTRY (__memset_sse2) + ENTRANCE + + movl LEN(%esp), %ecx +#ifdef USE_AS_BZERO + xor %eax, %eax +#else + movzbl CHR(%esp), %eax + movb %al, %ah + /* Fill the whole EAX with pattern. 
*/ + movl %eax, %edx + shl $16, %eax + or %edx, %eax +#endif + movl DEST(%esp), %edx + cmp $32, %ecx + jae L(32bytesormore) + +L(write_less32bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes)) + + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_less_32bytes): + .int JMPTBL (L(write_0bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_7bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_32bytes)) + .popsection + + ALIGN (4) +L(write_28bytes): + movl %eax, -28(%edx) +L(write_24bytes): + movl %eax, -24(%edx) +L(write_20bytes): + movl %eax, -20(%edx) +L(write_16bytes): + movl %eax, -16(%edx) +L(write_12bytes): + movl %eax, -12(%edx) +L(write_8bytes): + movl %eax, -8(%edx) +L(write_4bytes): + movl %eax, -4(%edx) +L(write_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(write_29bytes): + movl %eax, -29(%edx) +L(write_25bytes): + movl %eax, -25(%edx) +L(write_21bytes): + movl %eax, -21(%edx) +L(write_17bytes): + movl %eax, -17(%edx) +L(write_13bytes): + movl %eax, -13(%edx) +L(write_9bytes): + movl %eax, -9(%edx) +L(write_5bytes): + movl %eax, -5(%edx) +L(write_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_30bytes): + movl %eax, -30(%edx) +L(write_26bytes): + movl %eax, -26(%edx) +L(write_22bytes): + movl %eax, -22(%edx) +L(write_18bytes): + movl %eax, -18(%edx) +L(write_14bytes): + movl %eax, -14(%edx) +L(write_10bytes): + movl %eax, -10(%edx) +L(write_6bytes): + movl %eax, -6(%edx) +L(write_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_31bytes): + movl %eax, -31(%edx) +L(write_27bytes): + movl %eax, -27(%edx) +L(write_23bytes): + movl %eax, -23(%edx) +L(write_19bytes): + movl %eax, -19(%edx) +L(write_15bytes): + movl %eax, -15(%edx) +L(write_11bytes): + movl %eax, -11(%edx) +L(write_7bytes): + movl %eax, -7(%edx) +L(write_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +/* 
ECX > 32 and EDX is 4 byte aligned. */ +L(32bytesormore): + /* Fill xmm0 with the pattern. */ +#ifdef USE_AS_BZERO + pxor %xmm0, %xmm0 +#else + movd %eax, %xmm0 + pshufd $0, %xmm0, %xmm0 +#endif + testl $0xf, %edx + jz L(aligned_16) +/* ECX > 32 and EDX is not 16 byte aligned. */ +L(not_aligned_16): + movdqu %xmm0, (%edx) + movl %edx, %eax + and $-16, %edx + add $16, %edx + sub %edx, %eax + add %eax, %ecx + movd %xmm0, %eax + + ALIGN (4) +L(aligned_16): + cmp $128, %ecx + jae L(128bytesormore) + +L(aligned_16_less128bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytesormore): +#ifdef SHARED_CACHE_SIZE + PUSH (%ebx) + mov $SHARED_CACHE_SIZE, %ebx +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx +# else + PUSH (%ebx) + mov __x86_shared_cache_size, %ebx +# endif +#endif + cmp %ebx, %ecx + jae L(128bytesormore_nt_start) + + +#ifdef DATA_CACHE_SIZE + POP (%ebx) +# define RESTORE_EBX_STATE CFI_PUSH (%ebx) + cmp $DATA_CACHE_SIZE, %ecx +#else +# ifdef SHARED +# define RESTORE_EBX_STATE + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx +# else + POP (%ebx) +# define RESTORE_EBX_STATE CFI_PUSH (%ebx) + cmp __x86_data_cache_size, %ecx +# endif +#endif + + jae L(128bytes_L2_normal) + subl $128, %ecx +L(128bytesormore_normal): + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jb L(128bytesless_normal) + + + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jae L(128bytesormore_normal) + +L(128bytesless_normal): + add $128, %ecx + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytes_L2_normal): + prefetcht0 0x380(%edx) + prefetcht0 0x3c0(%edx) + sub $128, %ecx + movdqa %xmm0, (%edx) + movaps %xmm0, 0x10(%edx) + movaps %xmm0, 0x20(%edx) + movaps %xmm0, 0x30(%edx) + movaps %xmm0, 0x40(%edx) + movaps %xmm0, 0x50(%edx) + movaps %xmm0, 0x60(%edx) + movaps %xmm0, 0x70(%edx) + add $128, %edx + cmp $128, %ecx + jae L(128bytes_L2_normal) + +L(128bytesless_L2_normal): + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + RESTORE_EBX_STATE +L(128bytesormore_nt_start): + sub %ebx, %ecx + ALIGN (4) +L(128bytesormore_shared_cache_loop): + prefetcht0 0x3c0(%edx) + prefetcht0 0x380(%edx) + sub $0x80, %ebx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + add $0x80, %edx + cmp $0x80, %ebx + jae L(128bytesormore_shared_cache_loop) + cmp $0x80, %ecx + jb L(shared_cache_loop_end) + ALIGN (4) +L(128bytesormore_nt): + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm0, 0x10(%edx) + movntdq %xmm0, 0x20(%edx) + movntdq %xmm0, 0x30(%edx) + movntdq %xmm0, 0x40(%edx) + movntdq %xmm0, 0x50(%edx) + movntdq %xmm0, 0x60(%edx) + movntdq %xmm0, 0x70(%edx) + add $0x80, %edx + cmp $0x80, %ecx + jae L(128bytesormore_nt) + sfence +L(shared_cache_loop_end): +#if defined DATA_CACHE_SIZE || !defined SHARED + POP (%ebx) +#endif + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + + .pushsection 
.rodata.sse2,"a",@progbits + ALIGN (2) +L(table_16_128bytes): + .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes)) + .int JMPTBL 
(L(aligned_16_58bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes)) + .int 
JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes)) + .popsection + + ALIGN (4) +L(aligned_16_112bytes): + movdqa %xmm0, -112(%edx) +L(aligned_16_96bytes): + movdqa %xmm0, -96(%edx) +L(aligned_16_80bytes): + movdqa %xmm0, -80(%edx) +L(aligned_16_64bytes): + movdqa %xmm0, -64(%edx) +L(aligned_16_48bytes): + movdqa %xmm0, -48(%edx) +L(aligned_16_32bytes): + movdqa %xmm0, -32(%edx) +L(aligned_16_16bytes): + movdqa %xmm0, -16(%edx) +L(aligned_16_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_113bytes): + movdqa %xmm0, -113(%edx) +L(aligned_16_97bytes): + movdqa %xmm0, -97(%edx) +L(aligned_16_81bytes): + movdqa %xmm0, -81(%edx) +L(aligned_16_65bytes): + movdqa %xmm0, -65(%edx) +L(aligned_16_49bytes): + movdqa %xmm0, -49(%edx) +L(aligned_16_33bytes): + movdqa %xmm0, -33(%edx) +L(aligned_16_17bytes): + movdqa %xmm0, -17(%edx) +L(aligned_16_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_114bytes): + movdqa %xmm0, -114(%edx) +L(aligned_16_98bytes): + movdqa %xmm0, -98(%edx) +L(aligned_16_82bytes): + movdqa %xmm0, -82(%edx) +L(aligned_16_66bytes): + movdqa %xmm0, -66(%edx) +L(aligned_16_50bytes): + movdqa %xmm0, -50(%edx) +L(aligned_16_34bytes): + movdqa %xmm0, -34(%edx) +L(aligned_16_18bytes): + movdqa %xmm0, -18(%edx) +L(aligned_16_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_115bytes): + movdqa %xmm0, -115(%edx) +L(aligned_16_99bytes): + movdqa %xmm0, -99(%edx) +L(aligned_16_83bytes): + movdqa %xmm0, -83(%edx) +L(aligned_16_67bytes): + movdqa %xmm0, -67(%edx) +L(aligned_16_51bytes): + movdqa %xmm0, -51(%edx) +L(aligned_16_35bytes): + movdqa %xmm0, -35(%edx) +L(aligned_16_19bytes): + movdqa %xmm0, -19(%edx) +L(aligned_16_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_116bytes): + movdqa %xmm0, -116(%edx) +L(aligned_16_100bytes): + movdqa %xmm0, -100(%edx) +L(aligned_16_84bytes): + movdqa %xmm0, -84(%edx) +L(aligned_16_68bytes): + movdqa %xmm0, -68(%edx) +L(aligned_16_52bytes): + movdqa %xmm0, -52(%edx) +L(aligned_16_36bytes): + movdqa %xmm0, -36(%edx) +L(aligned_16_20bytes): + movdqa %xmm0, -20(%edx) +L(aligned_16_4bytes): + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_117bytes): + movdqa %xmm0, -117(%edx) +L(aligned_16_101bytes): + movdqa %xmm0, -101(%edx) +L(aligned_16_85bytes): + movdqa %xmm0, -85(%edx) +L(aligned_16_69bytes): + movdqa %xmm0, -69(%edx) +L(aligned_16_53bytes): + movdqa %xmm0, -53(%edx) +L(aligned_16_37bytes): + movdqa %xmm0, -37(%edx) +L(aligned_16_21bytes): + movdqa %xmm0, -21(%edx) +L(aligned_16_5bytes): + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_118bytes): + movdqa %xmm0, -118(%edx) +L(aligned_16_102bytes): + movdqa %xmm0, -102(%edx) +L(aligned_16_86bytes): + movdqa %xmm0, -86(%edx) +L(aligned_16_70bytes): + movdqa %xmm0, -70(%edx) +L(aligned_16_54bytes): + movdqa %xmm0, 
-54(%edx) +L(aligned_16_38bytes): + movdqa %xmm0, -38(%edx) +L(aligned_16_22bytes): + movdqa %xmm0, -22(%edx) +L(aligned_16_6bytes): + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_119bytes): + movdqa %xmm0, -119(%edx) +L(aligned_16_103bytes): + movdqa %xmm0, -103(%edx) +L(aligned_16_87bytes): + movdqa %xmm0, -87(%edx) +L(aligned_16_71bytes): + movdqa %xmm0, -71(%edx) +L(aligned_16_55bytes): + movdqa %xmm0, -55(%edx) +L(aligned_16_39bytes): + movdqa %xmm0, -39(%edx) +L(aligned_16_23bytes): + movdqa %xmm0, -23(%edx) +L(aligned_16_7bytes): + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_120bytes): + movdqa %xmm0, -120(%edx) +L(aligned_16_104bytes): + movdqa %xmm0, -104(%edx) +L(aligned_16_88bytes): + movdqa %xmm0, -88(%edx) +L(aligned_16_72bytes): + movdqa %xmm0, -72(%edx) +L(aligned_16_56bytes): + movdqa %xmm0, -56(%edx) +L(aligned_16_40bytes): + movdqa %xmm0, -40(%edx) +L(aligned_16_24bytes): + movdqa %xmm0, -24(%edx) +L(aligned_16_8bytes): + movq %xmm0, -8(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_121bytes): + movdqa %xmm0, -121(%edx) +L(aligned_16_105bytes): + movdqa %xmm0, -105(%edx) +L(aligned_16_89bytes): + movdqa %xmm0, -89(%edx) +L(aligned_16_73bytes): + movdqa %xmm0, -73(%edx) +L(aligned_16_57bytes): + movdqa %xmm0, -57(%edx) +L(aligned_16_41bytes): + movdqa %xmm0, -41(%edx) +L(aligned_16_25bytes): + movdqa %xmm0, -25(%edx) +L(aligned_16_9bytes): + movq %xmm0, -9(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_122bytes): + movdqa %xmm0, -122(%edx) +L(aligned_16_106bytes): + movdqa %xmm0, -106(%edx) +L(aligned_16_90bytes): + movdqa %xmm0, -90(%edx) +L(aligned_16_74bytes): + movdqa %xmm0, -74(%edx) +L(aligned_16_58bytes): + movdqa %xmm0, -58(%edx) +L(aligned_16_42bytes): + movdqa %xmm0, -42(%edx) +L(aligned_16_26bytes): + movdqa %xmm0, -26(%edx) +L(aligned_16_10bytes): + movq %xmm0, -10(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_123bytes): + movdqa %xmm0, -123(%edx) +L(aligned_16_107bytes): + movdqa %xmm0, -107(%edx) +L(aligned_16_91bytes): + movdqa %xmm0, -91(%edx) +L(aligned_16_75bytes): + movdqa %xmm0, -75(%edx) +L(aligned_16_59bytes): + movdqa %xmm0, -59(%edx) +L(aligned_16_43bytes): + movdqa %xmm0, -43(%edx) +L(aligned_16_27bytes): + movdqa %xmm0, -27(%edx) +L(aligned_16_11bytes): + movq %xmm0, -11(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_124bytes): + movdqa %xmm0, -124(%edx) +L(aligned_16_108bytes): + movdqa %xmm0, -108(%edx) +L(aligned_16_92bytes): + movdqa %xmm0, -92(%edx) +L(aligned_16_76bytes): + movdqa %xmm0, -76(%edx) +L(aligned_16_60bytes): + movdqa %xmm0, -60(%edx) +L(aligned_16_44bytes): + movdqa %xmm0, -44(%edx) +L(aligned_16_28bytes): + movdqa %xmm0, -28(%edx) +L(aligned_16_12bytes): + movq %xmm0, -12(%edx) + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_125bytes): + movdqa %xmm0, -125(%edx) +L(aligned_16_109bytes): + movdqa %xmm0, -109(%edx) +L(aligned_16_93bytes): + movdqa %xmm0, -93(%edx) +L(aligned_16_77bytes): + movdqa %xmm0, -77(%edx) +L(aligned_16_61bytes): + movdqa %xmm0, -61(%edx) +L(aligned_16_45bytes): + movdqa %xmm0, -45(%edx) +L(aligned_16_29bytes): + movdqa %xmm0, -29(%edx) +L(aligned_16_13bytes): + movq %xmm0, -13(%edx) + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_126bytes): + movdqa %xmm0, -126(%edx) +L(aligned_16_110bytes): + movdqa %xmm0, 
-110(%edx) +L(aligned_16_94bytes): + movdqa %xmm0, -94(%edx) +L(aligned_16_78bytes): + movdqa %xmm0, -78(%edx) +L(aligned_16_62bytes): + movdqa %xmm0, -62(%edx) +L(aligned_16_46bytes): + movdqa %xmm0, -46(%edx) +L(aligned_16_30bytes): + movdqa %xmm0, -30(%edx) +L(aligned_16_14bytes): + movq %xmm0, -14(%edx) + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_127bytes): + movdqa %xmm0, -127(%edx) +L(aligned_16_111bytes): + movdqa %xmm0, -111(%edx) +L(aligned_16_95bytes): + movdqa %xmm0, -95(%edx) +L(aligned_16_79bytes): + movdqa %xmm0, -79(%edx) +L(aligned_16_63bytes): + movdqa %xmm0, -63(%edx) +L(aligned_16_47bytes): + movdqa %xmm0, -47(%edx) +L(aligned_16_31bytes): + movdqa %xmm0, -31(%edx) +L(aligned_16_15bytes): + movq %xmm0, -15(%edx) + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN_END + +END (__memset_sse2) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S new file mode 100644 index 0000000000..f601663a9f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S @@ -0,0 +1,75 @@ +/* Multiple versions of memset + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) + .text +ENTRY(memset) + .type memset, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memset_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memset_sse2) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memset_sse2_rep) +2: ret +END(memset) + +# undef ENTRY +# define ENTRY(name) \ + .type __memset_ia32, @function; \ + .globl __memset_ia32; \ + .p2align 4; \ + __memset_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memset_ia32, .-__memset_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __memset_chk_ia32, @function; \ + .globl __memset_chk_ia32; \ + .p2align 4; \ + __memset_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. 
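The memset entry point above is the i386 IFUNC pattern in full: the resolver runs once at relocation time, starts from the baseline __memset_ia32, upgrades to __memset_sse2 when SSE2 is available, and to __memset_sse2_rep when the CPU also reports Fast_Rep_String. Expressed in C with GCC's ifunc attribute, the same ladder would look roughly like this (an illustrative sketch only; the feature probes stand in for HAS_CPU_FEATURE/HAS_ARCH_FEATURE and are not real glibc interfaces):

    #include <stddef.h>
    #include <stdbool.h>

    typedef void *(*memset_fn) (void *, int, size_t);

    extern void *__memset_ia32 (void *, int, size_t);
    extern void *__memset_sse2 (void *, int, size_t);
    extern void *__memset_sse2_rep (void *, int, size_t);

    extern bool cpu_has_sse2 (void);            /* hypothetical probes */
    extern bool cpu_has_fast_rep_string (void);

    static memset_fn
    resolve_memset (void)
    {
      if (!cpu_has_sse2 ())
        return __memset_ia32;
      return cpu_has_fast_rep_string () ? __memset_sse2_rep
                                        : __memset_sse2;
    }

    /* The dynamic loader calls resolve_memset once and patches the
       relocation with whichever pointer it returns.  */
    void *my_memset (void *, int, size_t)
      __attribute__ ((ifunc ("resolve_memset")));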
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memset; __GI_memset = __memset_ia32 +# endif + +# undef strong_alias +# define strong_alias(original, alias) +#endif + +#include "../memset.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S new file mode 100644 index 0000000000..573cf4208a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S @@ -0,0 +1,82 @@ +/* Multiple versions of __memset_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) + .text +ENTRY(__memset_chk) + .type __memset_chk, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memset_chk_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memset_chk_sse2) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memset_chk_sse2_rep) +2: ret +END(__memset_chk) + +# ifdef SHARED +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) + .section .gnu.warning.__memset_zero_constant_len_parameter + .string "memset used with constant zero length parameter; this could be due to transposed parameters" +# else + .text + .type __memset_chk_sse2, @function + .p2align 4; +__memset_chk_sse2: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memset_sse2 + cfi_endproc + .size __memset_chk_sse2, .-__memset_chk_sse2 + + .type __memset_chk_sse2_rep, @function + .p2align 4; +__memset_chk_sse2_rep: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memset_sse2_rep + cfi_endproc + .size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep + + .type __memset_chk_ia32, @function + .p2align 4; +__memset_chk_ia32: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memset_ia32 + cfi_endproc + .size __memset_chk_ia32, .-__memset_chk_ia32 +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S new file mode 100644 index 0000000000..88c0e5776c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S @@ -0,0 +1,3 @@ +#define USE_AS_RAWMEMCHR +#define MEMCHR __rawmemchr_sse2_bsf +#include "memchr-sse2-bsf.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S new file mode 100644 index 0000000000..038c74896b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_RAWMEMCHR +#define MEMCHR 
__rawmemchr_sse2 +#include "memchr-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S new file mode 100644 index 0000000000..0a41d63ee8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S @@ -0,0 +1,65 @@ +/* Multiple versions of rawmemchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__rawmemchr) + .type __rawmemchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + HAS_CPU_FEATURE (SSE2) + jz 2f + HAS_ARCH_FEATURE (Slow_BSF) + jz 3f + + LOAD_FUNC_GOT_EAX (__rawmemchr_sse2) + ret + +2: LOAD_FUNC_GOT_EAX (__rawmemchr_ia32) + ret + +3: LOAD_FUNC_GOT_EAX (__rawmemchr_sse2_bsf) + ret +END(__rawmemchr) + +weak_alias(__rawmemchr, rawmemchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __rawmemchr_ia32, @function; \ + .globl __rawmemchr_ia32; \ + .p2align 4; \ + __rawmemchr_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __rawmemchr_ia32, .-__rawmemchr_ia32 + +# undef libc_hidden_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_def(name) \ + .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_ia32 + +#endif +#include "../../rawmemchr.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c new file mode 100644 index 0000000000..1aa5440644 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c @@ -0,0 +1 @@ +#include <string/strnlen.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c new file mode 100644 index 0000000000..2e9619f97c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c @@ -0,0 +1,27 @@ +/* FMA version of fma. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +double +__fma_fma (double x, double y, double z) +{ + asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c new file mode 100644 index 0000000000..411ebb2ba9 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c @@ -0,0 +1,34 @@ +/* Multiple versions of fma. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include <math.h> +#include <init-arch.h> + +extern double __fma_ia32 (double x, double y, double z) attribute_hidden; +extern double __fma_fma (double x, double y, double z) attribute_hidden; + +libm_ifunc (__fma, + HAS_ARCH_FEATURE (FMA_Usable) ? __fma_fma : __fma_ia32); +weak_alias (__fma, fma) + +#define __fma __fma_ia32 + +#include <sysdeps/ieee754/ldbl-96/s_fma.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c new file mode 100644 index 0000000000..ee57abfda2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c @@ -0,0 +1,27 @@ +/* FMA version of fmaf. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +float +__fmaf_fma (float x, float y, float z) +{ + asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c new file mode 100644 index 0000000000..00b0fbcfc5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c @@ -0,0 +1,34 @@ +/* Multiple versions of fmaf. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
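The __fma_fma body above wires fma() directly to vfmadd213sd, which computes x*y + z with a single rounding instead of two; that single rounding is the defining property of fma, not merely a speed win. A small self-contained illustration in generic C99 (not part of this commit):

    #include <math.h>
    #include <stdio.h>

    int main (void)
    {
      double x = 1.0 + 0x1p-27;          /* 1 + 2^-27                    */
      double y = 1.0 + 0x1p-27;
      double z = -(1.0 + 0x1p-26);
      /* Exactly, x*y = 1 + 2^-26 + 2^-54.  Rounding the product to
         double first drops the 2^-54 term, so the sum cancels to 0.   */
      volatile double p = x * y;         /* force the first rounding    */
      printf ("%a\n", p + z);            /* prints 0x0p+0               */
      /* The fused form rounds once, after the add, and keeps it.      */
      printf ("%a\n", fma (x, y, z));    /* prints 0x1p-54              */
      return 0;
    }

The libm_ifunc selector picks __fma_fma only when FMA_Usable is set, so machines without the instruction keep the slower __fma_ia32 path with identical, correctly rounded results.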
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include <math.h> +#include <init-arch.h> + +extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden; +extern float __fmaf_fma (float x, float y, float z) attribute_hidden; + +libm_ifunc (__fmaf, + HAS_ARCH_FEATURE (FMA_Usable) ? __fmaf_fma : __fmaf_ia32); +weak_alias (__fmaf, fmaf) + +#define __fmaf __fmaf_ia32 + +#include <sysdeps/ieee754/dbl-64/s_fmaf.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c new file mode 100644 index 0000000000..7db31b02f8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/sched_cpucount.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S new file mode 100644 index 0000000000..46ca1b3074 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_sse2 +#include "strcpy-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S new file mode 100644 index 0000000000..d971c2da38 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S new file mode 100644 index 0000000000..ee81ab6ae3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S @@ -0,0 +1,9 @@ +/* Multiple versions of stpcpy + All versions must be listed in ifunc-impl-list.c. 
*/ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S new file mode 100644 index 0000000000..37a703cb76 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_sse2 +#include "strcpy-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S new file mode 100644 index 0000000000..14ed16f6b5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S new file mode 100644 index 0000000000..2698ca6a8c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S @@ -0,0 +1,8 @@ +/* Multiple versions of stpncpy + All versions must be listed in ifunc-impl-list.c. */ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c new file mode 100644 index 0000000000..753c6ec84a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c @@ -0,0 +1,12 @@ +#include <string.h> + +extern __typeof (strcasecmp) __strcasecmp_nonascii; + +#define __strcasecmp __strcasecmp_nonascii +#include <string/strcasecmp.c> + +strong_alias (__strcasecmp_nonascii, __strcasecmp_ia32) + +/* The needs of strcasecmp in libc are minimal, no need to go through + the IFUNC. */ +strong_alias (__strcasecmp_nonascii, __GI___strcasecmp) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S new file mode 100644 index 0000000000..ec59276408 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S @@ -0,0 +1,39 @@ +/* Entry point for multi-version x86 strcasecmp. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
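strcasecmp-c.c above combines two idioms used throughout this directory: the generic C implementation is compiled under a variant name by redefining __strcasecmp before #including string/strcasecmp.c, and internal references bypass the IFUNC entirely by pointing the __GI___strcasecmp symbol straight at one implementation (worthwhile on i386, where reaching an IFUNC through the PLT needs the EBX setup mentioned in the comments above). strong_alias itself is a thin wrapper over GCC's alias attribute; approximately, with illustrative names:

    /* one_impl.c -- one function body, several exported names.  */
    int my_impl (int x) { return x + 1; }

    /* Roughly what strong_alias (my_impl, my_alias) expands to: both
       symbols resolve to the same address, so calls through the alias
       cost nothing extra.  */
    extern __typeof (my_impl) my_alias __attribute__ ((alias ("my_impl")));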
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY(__strcasecmp) + .type __strcasecmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strcasecmp_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__strcasecmp_ssse3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + LOAD_FUNC_GOT_EAX (__strcasecmp_sse4_2) +2: ret +END(__strcasecmp) + +weak_alias (__strcasecmp, strcasecmp) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c new file mode 100644 index 0000000000..d4fcd2b4a1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c @@ -0,0 +1,13 @@ +#include <string.h> + +extern __typeof (strcasecmp_l) __strcasecmp_l_nonascii; + +#define __strcasecmp_l __strcasecmp_l_nonascii +#define USE_IN_EXTENDED_LOCALE_MODEL 1 +#include <string/strcasecmp.c> + +strong_alias (__strcasecmp_l_nonascii, __strcasecmp_l_ia32) + +/* The needs of strcasecmp in libc are minimal, no need to go through + the IFUNC. */ +strong_alias (__strcasecmp_l_nonascii, __GI___strcasecmp_l) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S new file mode 100644 index 0000000000..411d4153f2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S @@ -0,0 +1,2 @@ +#define USE_AS_STRCASECMP_L 1 +#include "strcmp-sse4.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S new file mode 100644 index 0000000000..a22b93c518 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S @@ -0,0 +1,2 @@ +#define USE_AS_STRCASECMP_L 1 +#include "strcmp-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S new file mode 100644 index 0000000000..711c09b0dc --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S @@ -0,0 +1,7 @@ +/* Multiple versions of strcasecmp_l + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP __strcasecmp_l +#define USE_AS_STRCASECMP_L +#include "strcmp.S" + +weak_alias (__strcasecmp_l, strcasecmp_l) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S new file mode 100644 index 0000000000..6359c7330c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S @@ -0,0 +1,1245 @@ +/* strcat with SSE2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
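strcasecmp_l-c.c in the hunk above builds the explicit-locale variant of the same generic body by defining USE_IN_EXTENDED_LOCALE_MODEL, so case conversion consults the locale_t argument rather than the thread's current locale. Typical use of the resulting interface (a hedged example against the POSIX.1-2008 API, not code from this commit):

    #include <strings.h>
    #include <locale.h>
    #include <stdio.h>

    int main (void)
    {
      locale_t c_loc = newlocale (LC_ALL_MASK, "C", (locale_t) 0);
      if (c_loc == (locale_t) 0)
        return 1;
      /* Compares case-insensitively under the "C" locale no matter
         what setlocale() last installed for the thread.  */
      printf ("%d\n", strcasecmp_l ("GNU libc", "gnu LIBC", c_loc));
      freelocale (c_loc);
      return 0;
    }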
*/
+
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG) \
+	cfi_adjust_cfa_offset (4); \
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+	cfi_adjust_cfa_offset (-4); \
+	cfi_restore (REG)
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifdef SHARED
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into ECX and branch to it. TABLE is a
+   jump table with relative offsets. INDEX is a register containing the
+   index into the jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+	/* We first load PC into ECX. */ \
+	SETUP_PIC_REG(cx); \
+	/* Get the address of the jump table. */ \
+	addl $(TABLE - .), %ecx; \
+	/* Get the entry and convert the relative offset to the \
+	   absolute address. */ \
+	addl (%ecx,INDEX,SCALE), %ecx; \
+	/* We loaded the jump table and adjusted ECX. Go. */ \
+	jmp *%ecx
+# else
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+   absolute offsets. INDEX is a register containing the index into the
+   jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+	jmp *TABLE(,INDEX,SCALE)
+# endif
+
+# ifndef STRCAT
+# define STRCAT __strcat_sse2
+# endif
+
+# define PARMS 4
+# define STR1 PARMS+4
+# define STR2 STR1+4
+
+# ifdef USE_AS_STRNCAT
+# define LEN STR2+8
+# define STR3 STR1+4
+# else
+# define STR3 STR1
+# endif
+
+# define USE_AS_STRCAT
+# ifdef USE_AS_STRNCAT
+# define RETURN POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi);
+# else
+# define RETURN POP(%esi); ret; CFI_PUSH(%esi);
+# endif
+
+.text
+ENTRY (STRCAT)
+	PUSH (%esi)
+	mov STR1(%esp), %eax
+	mov STR2(%esp), %esi
+# ifdef USE_AS_STRNCAT
+	PUSH (%ebx)
+	movl LEN(%esp), %ebx
+	test %ebx, %ebx
+	jz L(ExitZero)
+# endif
+	cmpb $0, (%esi)
+	mov %esi, %ecx
+	mov %eax, %edx
+	jz L(ExitZero)
+
+	and $63, %ecx
+	and $63, %edx
+	cmp $32, %ecx
+	ja L(StrlenCore7_1)
+	cmp $48, %edx
+	ja L(alignment_prolog)
+
+	pxor %xmm0, %xmm0
+	pxor %xmm4, %xmm4
+	pxor %xmm7, %xmm7
+	movdqu (%eax), %xmm1
+	movdqu (%esi), %xmm5
+	pcmpeqb %xmm1, %xmm0
+	movdqu 16(%esi), %xmm6
+	pmovmskb %xmm0, %ecx
+	pcmpeqb %xmm5, %xmm4
+	pcmpeqb %xmm6, %xmm7
+	test %ecx, %ecx
+	jnz L(exit_less16_)
+	mov %eax, %ecx
+	and $-16, %eax
+	jmp L(loop_prolog)
+
+L(alignment_prolog):
+	pxor %xmm0, %xmm0
+	pxor %xmm4, %xmm4
+	mov %edx, %ecx
+	pxor %xmm7, %xmm7
+	and $15, %ecx
+	and $-16, %eax
+	pcmpeqb (%eax), %xmm0
+	movdqu (%esi), %xmm5
+	movdqu 16(%esi), %xmm6
+	pmovmskb %xmm0, %edx
+	pcmpeqb %xmm5, %xmm4
+	shr %cl, %edx
+	pcmpeqb %xmm6, %xmm7
+	test %edx, %edx
+	jnz L(exit_less16)
+	add %eax, %ecx
+
+	pxor %xmm0, %xmm0
+L(loop_prolog):
+	pxor %xmm1, %xmm1
+	pxor %xmm2, %xmm2
+	pxor %xmm3, %xmm3
+	.p2align 4
+L(align16_loop):
+	pcmpeqb 16(%eax), %xmm0
+	pmovmskb %xmm0, %edx
+	test %edx, %edx
+	jnz L(exit16)
+
+	pcmpeqb 32(%eax), %xmm1
+	pmovmskb %xmm1, %edx
+	test %edx, %edx
+	jnz L(exit32)
+
+	pcmpeqb 48(%eax), %xmm2
+	pmovmskb %xmm2, %edx
+	test %edx, %edx
+	jnz L(exit48)
+
+	pcmpeqb 64(%eax), %xmm3
+	pmovmskb %xmm3, %edx
+	lea 64(%eax), %eax
+	test %edx, %edx
+	jz L(align16_loop)
+	bsf %edx, %edx
+	add %edx, %eax
+	jmp L(StartStrcpyPart)
+
+	.p2align 4
+L(exit16):
+	bsf %edx, %edx
+	lea 16(%eax, %edx), %eax
+	jmp L(StartStrcpyPart)
+
+	.p2align 4
+L(exit32):
+	bsf %edx, %edx
+	lea 32(%eax, %edx), %eax
+	jmp L(StartStrcpyPart)
+
+	.p2align 4
+L(exit48):
+	bsf %edx, %edx
+	lea 48(%eax, %edx), %eax
+	jmp L(StartStrcpyPart)
+
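+	/* The jump-table dispatch defined by BRANCH_TO_JMPTBL_ENTRY
+	   above is the mechanism behind every L(Exit*) branch below.
+	   A rough C sketch of the SHARED (PIC) flavour, where each
+	   table entry stores "handler minus table" (hypothetical
+	   names, an illustration only, not glibc code):
+
+		#include <stdint.h>
+
+		extern const int32_t table[];  // entry i: handler_i - table
+		typedef void (*handler_t) (void);
+
+		static handler_t
+		lookup (unsigned int index)
+		{
+		  // Rebuild the absolute handler address from the
+		  // table-relative entry, as the addl sequence does.
+		  return (handler_t) ((const char *) table + table[index]);
+		}
+
+	   Storing self-relative offsets keeps the table free of dynamic
+	   relocations; that is why the PIC variant computes the table
+	   address at run time instead of embedding absolute pointers. */
+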
.p2align 4 +L(exit_less16): + bsf %edx, %edx + add %ecx, %eax + add %edx, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_less16_): + bsf %ecx, %ecx + add %ecx, %eax + + .p2align 4 +L(StartStrcpyPart): + pmovmskb %xmm4, %edx +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1) + + movdqu %xmm5, (%eax) + pmovmskb %xmm7, %edx +# ifdef USE_AS_STRNCAT + cmp $32, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes1) + + mov %esi, %ecx + and $-16, %esi + and $15, %ecx + pxor %xmm0, %xmm0 +# ifdef USE_AS_STRNCAT + add %ecx, %ebx + sbb %edx, %edx + or %edx, %ebx +# endif + sub %ecx, %eax + jmp L(Unalign16Both) + +L(StrlenCore7_1): + mov %eax, %ecx + pxor %xmm0, %xmm0 + and $15, %ecx + and $-16, %eax + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + shr %cl, %edx + test %edx, %edx + jnz L(exit_less16_1) + add %eax, %ecx + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + .p2align 4 +L(align16_loop_1): + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16_1) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32_1) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48_1) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%eax), %eax + test %edx, %edx + jz L(align16_loop_1) + bsf %edx, %edx + add %edx, %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit16_1): + bsf %edx, %edx + lea 16(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit32_1): + bsf %edx, %edx + lea 32(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit48_1): + bsf %edx, %edx + lea 48(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit_less16_1): + bsf %edx, %edx + add %ecx, %eax + add %edx, %eax + + .p2align 4 +L(StartStrcpyPart_1): + mov %esi, %ecx + and $15, %ecx + and $-16, %esi + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +# ifdef USE_AS_STRNCAT + cmp $48, %ebx + ja L(BigN) +# endif + pcmpeqb (%esi), %xmm1 +# ifdef USE_AS_STRNCAT + add %ecx, %ebx +# endif + pmovmskb %xmm1, %edx + shr %cl, %edx +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STRNCAT + cmp $32, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%eax) + sub %ecx, %eax + + .p2align 4 +L(Unalign16Both): + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $48, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +L(Unalign16BothBigN): + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%eax, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm4 + movdqu %xmm3, (%eax, %ecx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm1 + movdqu %xmm4, 
(%eax, %ecx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%eax, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movdqu %xmm3, (%eax, %ecx) + mov %esi, %edx + lea 16(%esi, %ecx), %esi + and $-0x40, %esi + sub %esi, %edx + sub %edx, %eax +# ifdef USE_AS_STRNCAT + lea 128(%ebx, %edx), %ebx +# endif + movaps (%esi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%esi), %xmm5 + movaps 32(%esi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%esi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx +# ifdef USE_AS_STRNCAT + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jnz L(Unaligned64Leave) + + .p2align 4 +L(Unaligned64Loop_start): + add $64, %eax + add $64, %esi + movdqu %xmm4, -64(%eax) + movaps (%esi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%eax) + movaps 16(%esi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%esi), %xmm3 + movdqu %xmm6, -32(%eax) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%eax) + movaps 48(%esi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx +# ifdef USE_AS_STRNCAT + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jz L(Unaligned64Loop_start) + +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %ecx, %ecx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %ecx, %edx + movdqu %xmm4, (%eax) + movdqu %xmm5, 16(%eax) + movdqu %xmm6, 32(%eax) + add $48, %esi + add $48, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +# ifdef USE_AS_STRNCAT + .p2align 4 +L(BigN): + pcmpeqb (%esi), %xmm1 + pmovmskb %xmm1, %edx + shr %cl, %edx + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%eax) + sub %ecx, %eax + sub $48, %ebx + add %ecx, %ebx + + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + jmp L(Unalign16BothBigN) +# endif + +/*------------end of main part-------------------------------*/ + +/* Case1 */ + .p2align 4 +L(CopyFrom1To16Bytes): + add %ecx, %eax + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTail): + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %esi + add $16, %eax +L(CopyFrom1To16BytesTail1): + bsf %edx, %edx + 
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + bsf %edx, %edx + add %ecx, %esi + add $16, %edx + sub %ecx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %ecx, %edx + movdqu %xmm4, (%eax) + add $16, %esi + add $16, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %edx, %edx + movdqu %xmm4, (%eax) + movdqu %xmm5, 16(%eax) + add $32, %esi + add $32, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +# ifdef USE_AS_STRNCAT + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %ecx, %eax + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + add $16, %edx + sub %ecx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +L(CopyFrom1To16BytesTailCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %ecx, %eax + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To32BytesCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTailCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %eax + add $16, %esi + sub $16, %ebx +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +# endif + +# ifdef USE_AS_STRNCAT + .p2align 4 +L(StrncatExit0): + movb %bh, (%eax) + mov STR3(%esp), %eax + RETURN +# endif + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit1): + movb %bh, 1(%eax) +# endif +L(Exit1): +# ifdef USE_AS_STRNCAT + movb (%esi), %dh +# endif + movb %dh, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit2): + movb %bh, 2(%eax) +# endif +L(Exit2): + movw (%esi), %dx + movw %dx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit3): + movb %bh, 3(%eax) +# endif +L(Exit3): + movw (%esi), %cx + movw %cx, (%eax) +# ifdef USE_AS_STRNCAT + movb 2(%esi), %dh +# endif + movb %dh, 2(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit4): + movb %bh, 4(%eax) +# endif +L(Exit4): + movl (%esi), %edx + movl %edx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit5): + movb %bh, 5(%eax) +# endif +L(Exit5): + movl (%esi), %ecx +# 
ifdef USE_AS_STRNCAT + movb 4(%esi), %dh +# endif + movb %dh, 4(%eax) + movl %ecx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit6): + movb %bh, 6(%eax) +# endif +L(Exit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%eax) + movw %dx, 4(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit7): + movb %bh, 7(%eax) +# endif +L(Exit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%eax) + movl %edx, 3(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit8): + movb %bh, 8(%eax) +# endif +L(Exit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit9): + movb %bh, 9(%eax) +# endif +L(Exit9): + movlpd (%esi), %xmm0 +# ifdef USE_AS_STRNCAT + movb 8(%esi), %dh +# endif + movb %dh, 8(%eax) + movlpd %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit10): + movb %bh, 10(%eax) +# endif +L(Exit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%eax) + movw %dx, 8(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit11): + movb %bh, 11(%eax) +# endif +L(Exit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%eax) + movl %edx, 7(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit12): + movb %bh, 12(%eax) +# endif +L(Exit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%eax) + movl %edx, 8(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit13): + movb %bh, 13(%eax) +# endif +L(Exit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 5(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit14): + movb %bh, 14(%eax) +# endif +L(Exit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 6(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit15): + movb %bh, 15(%eax) +# endif +L(Exit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 7(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit16): + movb %bh, 16(%eax) +# endif +L(Exit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit17): + movb %bh, 17(%eax) +# endif +L(Exit17): + movdqu (%esi), %xmm0 +# ifdef USE_AS_STRNCAT + movb 16(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movb %dh, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit18): + movb %bh, 18(%eax) +# endif +L(Exit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%eax) + movw %cx, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit19): + movb %bh, 19(%eax) +# endif +L(Exit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%eax) + movl %ecx, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit20): + movb %bh, 20(%eax) +# endif +L(Exit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%eax) + movl %ecx, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit21): + movb %bh, 21(%eax) +# endif +L(Exit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx +# ifdef 
USE_AS_STRNCAT + movb 20(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movl %ecx, 16(%eax) + movb %dh, 20(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit22): + movb %bh, 22(%eax) +# endif +L(Exit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%eax) + movlpd %xmm3, 14(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit23): + movb %bh, 23(%eax) +# endif +L(Exit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%eax) + movlpd %xmm3, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit24): + movb %bh, 24(%eax) +# endif +L(Exit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit25): + movb %bh, 25(%eax) +# endif +L(Exit25): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 +# ifdef USE_AS_STRNCAT + movb 24(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movb %dh, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit26): + movb %bh, 26(%eax) +# endif +L(Exit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movw %cx, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit27): + movb %bh, 27(%eax) +# endif +L(Exit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movl %ecx, 23(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit28): + movb %bh, 28(%eax) +# endif +L(Exit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movl %ecx, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit29): + movb %bh, 29(%eax) +# endif +L(Exit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 13(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit30): + movb %bh, 30(%eax) +# endif +L(Exit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 14(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit31): + movb %bh, 31(%eax) +# endif +L(Exit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit32): + movb %bh, 32(%eax) +# endif +L(Exit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 16(%eax) + mov STR3(%esp), %eax + RETURN + +# ifdef USE_AS_STRNCAT + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %edx, %edx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%ebx), %ecx + and $-16, %ecx + add $48, %ebx + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%eax) + xor %bh, %bh + movb %bh, 64(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %ecx, %ecx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $48, %ebx + jle 
L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm4, (%eax) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm5, 16(%eax) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm6, 32(%eax) + lea 16(%eax, %ecx), %eax + lea 16(%esi, %ecx), %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) +# endif + .p2align 4 +L(ExitZero): + RETURN + +END (STRCAT) + + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) +# ifdef USE_AS_STRNCAT +L(ExitStrncatTable): + .int JMPTBL(L(StrncatExit0), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit1), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit2), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit3), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit4), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit5), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit6), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit7), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit8), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit9), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit10), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit11), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit12), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit13), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit14), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit15), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit16), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit17), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit18), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit19), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit20), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit21), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit22), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit23), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit24), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit25), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit26), L(ExitStrncatTable)) + .int 
JMPTBL(L(StrncatExit27), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit28), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit29), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit30), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit31), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit32), L(ExitStrncatTable)) +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S new file mode 100644 index 0000000000..59ffbc60a5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S @@ -0,0 +1,572 @@ +/* strcat with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef STRCAT +# define STRCAT __strcat_ssse3 +# endif + +# define PARMS 4 +# define STR1 PARMS+4 +# define STR2 STR1+4 + +# ifdef USE_AS_STRNCAT +# define LEN STR2+8 +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) + PUSH (%edi) + mov STR1(%esp), %edi + mov %edi, %edx + +# define RETURN jmp L(StartStrcpyPart) +# include "strlen-sse2.S" + +L(StartStrcpyPart): + mov STR2(%esp), %ecx + lea (%edi, %eax), %edx +# ifdef USE_AS_STRNCAT + PUSH (%ebx) + mov LEN(%esp), %ebx + test %ebx, %ebx + jz L(StrncatExit0) + cmp $8, %ebx + jbe L(StrncatExit8Bytes) +# endif + cmpb $0, (%ecx) + jz L(Exit1) + cmpb $0, 1(%ecx) + jz L(Exit2) + cmpb $0, 2(%ecx) + jz L(Exit3) + cmpb $0, 3(%ecx) + jz L(Exit4) + cmpb $0, 4(%ecx) + jz L(Exit5) + cmpb $0, 5(%ecx) + jz L(Exit6) + cmpb $0, 6(%ecx) + jz L(Exit7) + cmpb $0, 7(%ecx) + jz L(Exit8) + cmpb $0, 8(%ecx) + jz L(Exit9) +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jb L(StrncatExit15Bytes) +# endif + cmpb $0, 9(%ecx) + jz L(Exit10) + cmpb $0, 10(%ecx) + jz L(Exit11) + cmpb $0, 11(%ecx) + jz L(Exit12) + cmpb $0, 12(%ecx) + jz L(Exit13) + cmpb $0, 13(%ecx) + jz L(Exit14) + cmpb $0, 14(%ecx) + jz L(Exit15) + cmpb $0, 15(%ecx) + jz L(Exit16) +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + je L(StrncatExit16) + +# define RETURN1 \ + POP (%ebx); \ + POP (%edi); \ + ret; \ + CFI_PUSH (%ebx); \ + CFI_PUSH (%edi) +# define USE_AS_STRNCPY +# else +# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi) +# endif +# include "strcpy-ssse3.S" + .p2align 4 +L(CopyFrom1To16Bytes): + add %esi, %edx + add %esi, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) 
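+	/* AL holds the low byte of the pcmpeqb/pmovmskb null mask: bit n
+	   set means byte n of this 16-byte chunk is the terminating NUL.
+	   The unrolled tests above dispatch on the lowest set bit without
+	   using bsf (cf. the Slow_BSF dispatch in strchr.S); falling
+	   through to here means the NUL sits at offset 7, so exactly
+	   eight bytes (tail plus NUL) remain to copy. A rough C sketch
+	   of the whole block (hypothetical names, illustration only,
+	   not glibc code):
+
+		#include <string.h>
+
+		static char *
+		copy_tail (char *dst, const char *src, unsigned int mask)
+		{
+		  // mask != 0 comes from pmovmskb; its lowest set bit
+		  // marks the NUL byte within the 16-byte chunk.
+		  unsigned int nul = __builtin_ctz (mask);
+		  memcpy (dst, src, nul + 1);  // copy tail including NUL
+		  return dst;
+		}
+	*/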
+ movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + movlpd (%ecx), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit1): + movb %bh, 1(%edx) +L(Exit1): + movb (%ecx), %al + movb %al, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit2): + movb %bh, 2(%edx) +L(Exit2): + movw (%ecx), %ax + movw %ax, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit3): + movb %bh, 3(%edx) +L(Exit3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit4): + movb %bh, 4(%edx) +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit5): + movb %bh, 5(%edx) +L(Exit5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit6): + movb %bh, 6(%edx) +L(Exit6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit7): + movb %bh, 7(%edx) +L(Exit7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit8): + movb %bh, 8(%edx) +L(Exit8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit9): + movb %bh, 9(%edx) +L(Exit9): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit10): + movb %bh, 10(%edx) +L(Exit10): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit11): + movb %bh, 11(%edx) +L(Exit11): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit12): + movb %bh, 12(%edx) +L(Exit12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit13): + movb %bh, 13(%edx) +L(Exit13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit14): + movb %bh, 14(%edx) +L(Exit14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit15): + movb %bh, 15(%edx) +L(Exit15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit16): + movb %bh, 16(%edx) +L(Exit16): + movlpd (%ecx), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) + movl %edi, %eax + RETURN1 + +# ifdef USE_AS_STRNCPY + + CFI_PUSH(%esi) + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %esi, %ecx + lea (%esi, %edx), %esi + lea -9(%ebx), %edx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%esi), %edx + POP (%esi) + jz L(ExitHighCase2) + + test $0x01, %al + jnz L(Exit1) + cmp $1, %ebx + je L(StrncatExit1) + test $0x02, %al + jnz L(Exit2) + cmp $2, %ebx + je 
L(StrncatExit2) + test $0x04, %al + jnz L(Exit3) + cmp $3, %ebx + je L(StrncatExit3) + test $0x08, %al + jnz L(Exit4) + cmp $4, %ebx + je L(StrncatExit4) + test $0x10, %al + jnz L(Exit5) + cmp $5, %ebx + je L(StrncatExit5) + test $0x20, %al + jnz L(Exit6) + cmp $6, %ebx + je L(StrncatExit6) + test $0x40, %al + jnz L(Exit7) + cmp $7, %ebx + je L(StrncatExit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + lea 7(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax + xor %cl, %cl + movb %cl, (%eax) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(ExitHighCase2): + test $0x01, %ah + jnz L(Exit9) + cmp $9, %ebx + je L(StrncatExit9) + test $0x02, %ah + jnz L(Exit10) + cmp $10, %ebx + je L(StrncatExit10) + test $0x04, %ah + jnz L(Exit11) + cmp $11, %ebx + je L(StrncatExit11) + test $0x8, %ah + jnz L(Exit12) + cmp $12, %ebx + je L(StrncatExit12) + test $0x10, %ah + jnz L(Exit13) + cmp $13, %ebx + je L(StrncatExit13) + test $0x20, %ah + jnz L(Exit14) + cmp $14, %ebx + je L(StrncatExit14) + test $0x40, %ah + jnz L(Exit15) + cmp $15, %ebx + je L(StrncatExit15) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm1 + movlpd %xmm1, 8(%edx) + movl %edi, %eax + RETURN1 + + CFI_PUSH(%esi) + +L(CopyFrom1To16BytesCase2OrCase3): + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %esi, %edx + add %esi, %ecx + + POP (%esi) + + cmp $8, %ebx + ja L(ExitHighCase3) + cmp $1, %ebx + je L(StrncatExit1) + cmp $2, %ebx + je L(StrncatExit2) + cmp $3, %ebx + je L(StrncatExit3) + cmp $4, %ebx + je L(StrncatExit4) + cmp $5, %ebx + je L(StrncatExit5) + cmp $6, %ebx + je L(StrncatExit6) + cmp $7, %ebx + je L(StrncatExit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movb %bh, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(ExitHighCase3): + cmp $9, %ebx + je L(StrncatExit9) + cmp $10, %ebx + je L(StrncatExit10) + cmp $11, %ebx + je L(StrncatExit11) + cmp $12, %ebx + je L(StrncatExit12) + cmp $13, %ebx + je L(StrncatExit13) + cmp $14, %ebx + je L(StrncatExit14) + cmp $15, %ebx + je L(StrncatExit15) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm1 + movlpd %xmm1, 8(%edx) + movb %bh, 16(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit0): + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit15Bytes): + cmp $9, %ebx + je L(StrncatExit9) + cmpb $0, 9(%ecx) + jz L(Exit10) + cmp $10, %ebx + je L(StrncatExit10) + cmpb $0, 10(%ecx) + jz L(Exit11) + cmp $11, %ebx + je L(StrncatExit11) + cmpb $0, 11(%ecx) + jz L(Exit12) + cmp $12, %ebx + je L(StrncatExit12) + cmpb $0, 12(%ecx) + jz L(Exit13) + cmp $13, %ebx + je L(StrncatExit13) + cmpb $0, 13(%ecx) + jz L(Exit14) + cmp $14, %ebx + je L(StrncatExit14) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) + lea 14(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax + movb %bh, (%eax) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit8Bytes): + cmpb $0, (%ecx) + jz L(Exit1) + cmp $1, %ebx + je L(StrncatExit1) + cmpb $0, 1(%ecx) + jz L(Exit2) + cmp $2, %ebx + je L(StrncatExit2) + cmpb $0, 2(%ecx) + jz L(Exit3) + cmp $3, %ebx + je L(StrncatExit3) + cmpb $0, 3(%ecx) + jz L(Exit4) + cmp $4, %ebx + je L(StrncatExit4) + cmpb $0, 4(%ecx) + jz L(Exit5) + cmp $5, %ebx + je L(StrncatExit5) + cmpb $0, 5(%ecx) + jz L(Exit6) + cmp $6, %ebx + je L(StrncatExit6) + cmpb $0, 6(%ecx) + jz L(Exit7) + cmp $7, %ebx + je L(StrncatExit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + lea 7(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax + movb %bh, 
(%eax)
+	movl %edi, %eax
+	RETURN1
+
+# endif
+END (STRCAT)
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
new file mode 100644
index 0000000000..8412cb6f23
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S
@@ -0,0 +1,92 @@
+/* Multiple versions of strcat
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#ifndef USE_AS_STRNCAT
+# ifndef STRCAT
+# define STRCAT strcat
+# endif
+#endif
+
+#ifdef USE_AS_STRNCAT
+# define STRCAT_SSSE3 __strncat_ssse3
+# define STRCAT_SSE2 __strncat_sse2
+# define STRCAT_IA32 __strncat_ia32
+# define __GI_STRCAT __GI_strncat
+#else
+# define STRCAT_SSSE3 __strcat_ssse3
+# define STRCAT_SSE2 __strcat_sse2
+# define STRCAT_IA32 __strcat_ia32
+# define __GI_STRCAT __GI_strcat
+#endif
+
+
+/* Define multiple versions only for the definition in libc. Don't
+   define multiple versions for strncat in the static library, since
+   we need strncat before initialization has happened. */
+#if IS_IN (libc)
+
+	.text
+ENTRY(STRCAT)
+	.type STRCAT, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCAT_IA32)
+	HAS_CPU_FEATURE (SSE2)
+	jz 2f
+	LOAD_FUNC_GOT_EAX (STRCAT_SSE2)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz 2f
+	HAS_CPU_FEATURE (SSSE3)
+	jz 2f
+	LOAD_FUNC_GOT_EAX (STRCAT_SSSE3)
+2:	ret
+END(STRCAT)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type STRCAT_IA32, @function; \
+	.align 16; \
+	.globl STRCAT_IA32; \
+	.hidden STRCAT_IA32; \
+	STRCAT_IA32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strcat calls through a PLT.
+   The speedup we get from using SSSE3 instructions is likely eaten away
+   by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+	.globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32
+
+# endif
+#endif
+
+#ifndef USE_AS_STRNCAT
+# include "../../strcat.S"
+#endif
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
new file mode 100644
index 0000000000..95fd7c084e
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S
@@ -0,0 +1,158 @@
+/* strchr with SSE2 and bsf
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 +# define ENTRANCE PUSH(%edi) +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); + +# define STR1 PARMS +# define STR2 STR1+4 + + .text +ENTRY (__strchr_sse2_bsf) + + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + /* ECX has OFFSET. */ + and $15, %ecx + pshufd $0, %xmm1, %xmm1 + je L(loop) + +/* Handle unaligned string. */ + and $-16, %edi + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm2, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + sarl %cl, %edx + sarl %cl, %eax + test %eax, %eax + je L(unaligned_no_match) + /* Check which byte is a match. */ + bsf %eax, %eax + /* Is there a NULL? */ + test %edx, %edx + je L(unaligned_match) + bsf %edx, %edx + cmpl %edx, %eax + /* Return NULL if NULL comes first. */ + ja L(return_null) +L(unaligned_match): + add %edi, %eax + add %ecx, %eax + RETURN + + .p2align 4 +L(unaligned_no_match): + test %edx, %edx + jne L(return_null) + pxor %xmm2, %xmm2 + + add $16, %edi + + .p2align 4 +/* Loop start on aligned string. */ +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + jmp L(loop) + +L(matches): + pmovmskb %xmm2, %edx + test %eax, %eax + jz L(return_null) + bsf %eax, %eax + /* There is a match. First find where NULL is. */ + test %edx, %edx + je L(match) + bsf %edx, %ecx + /* Check if NULL comes first. */ + cmpl %ecx, %eax + ja L(return_null) +L(match): + sub $16, %edi + add %edi, %eax + RETURN + +/* Return NULL. 
*/ + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + +END (__strchr_sse2_bsf) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S new file mode 100644 index 0000000000..1f9e875b04 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S @@ -0,0 +1,348 @@ +/* strchr SSE2 without bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 +# define ENTRANCE PUSH(%edi) +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); + +# define STR1 PARMS +# define STR2 STR1+4 + + atom_text_section +ENTRY (__strchr_sse2) + + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + /* ECX has OFFSET. */ + and $15, %ecx + pshufd $0, %xmm1, %xmm1 + je L(loop) + +/* Handle unaligned string. */ + and $-16, %edi + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm2, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + sarl %cl, %edx + sarl %cl, %eax + test %eax, %eax + jz L(unaligned_no_match) + /* Check which byte is a match. */ + /* Is there a NULL? */ + add %ecx, %edi + test %edx, %edx + jz L(match_case1) + jmp L(match_case2) + + .p2align 4 +L(unaligned_no_match): + test %edx, %edx + jne L(return_null) + + pxor %xmm2, %xmm2 + add $16, %edi + + .p2align 4 +/* Loop start on aligned string. */ +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + jmp L(loop) + +L(matches): + /* There is a match. First find where NULL is. 
*/ + test %edx, %edx + jz L(match_case1) + + .p2align 4 +L(match_case2): + test %al, %al + jz L(match_higth_case2) + + mov %al, %cl + and $15, %cl + jnz L(match_case2_4) + + mov %dl, %ch + and $15, %ch + jnz L(return_null) + + test $0x10, %al + jnz L(Exit5) + test $0x10, %dl + jnz L(return_null) + test $0x20, %al + jnz L(Exit6) + test $0x20, %dl + jnz L(return_null) + test $0x40, %al + jnz L(Exit7) + test $0x40, %dl + jnz L(return_null) + lea 7(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_4): + test $0x01, %al + jnz L(Exit1) + test $0x01, %dl + jnz L(return_null) + test $0x02, %al + jnz L(Exit2) + test $0x02, %dl + jnz L(return_null) + test $0x04, %al + jnz L(Exit3) + test $0x04, %dl + jnz L(return_null) + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(match_higth_case2): + test %dl, %dl + jnz L(return_null) + + mov %ah, %cl + and $15, %cl + jnz L(match_case2_12) + + mov %dh, %ch + and $15, %ch + jnz L(return_null) + + test $0x10, %ah + jnz L(Exit13) + test $0x10, %dh + jnz L(return_null) + test $0x20, %ah + jnz L(Exit14) + test $0x20, %dh + jnz L(return_null) + test $0x40, %ah + jnz L(Exit15) + test $0x40, %dh + jnz L(return_null) + lea 15(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_12): + test $0x01, %ah + jnz L(Exit9) + test $0x01, %dh + jnz L(return_null) + test $0x02, %ah + jnz L(Exit10) + test $0x02, %dh + jnz L(return_null) + test $0x04, %ah + jnz L(Exit11) + test $0x04, %dh + jnz L(return_null) + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(match_case1): + test %al, %al + jz L(match_higth_case1) + + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + lea 7(%edi), %eax + RETURN + + .p2align 4 +L(match_higth_case1): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + lea 15(%edi), %eax + RETURN + + .p2align 4 +L(Exit1): + lea (%edi), %eax + RETURN + + .p2align 4 +L(Exit2): + lea 1(%edi), %eax + RETURN + + .p2align 4 +L(Exit3): + lea 2(%edi), %eax + RETURN + + .p2align 4 +L(Exit4): + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(Exit5): + lea 4(%edi), %eax + RETURN + + .p2align 4 +L(Exit6): + lea 5(%edi), %eax + RETURN + + .p2align 4 +L(Exit7): + lea 6(%edi), %eax + RETURN + + .p2align 4 +L(Exit9): + lea 8(%edi), %eax + RETURN + + .p2align 4 +L(Exit10): + lea 9(%edi), %eax + RETURN + + .p2align 4 +L(Exit11): + lea 10(%edi), %eax + RETURN + + .p2align 4 +L(Exit12): + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(Exit13): + lea 12(%edi), %eax + RETURN + + .p2align 4 +L(Exit14): + lea 13(%edi), %eax + RETURN + + .p2align 4 +L(Exit15): + lea 14(%edi), %eax + RETURN + +/* Return NULL. */ + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + +END (__strchr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S new file mode 100644 index 0000000000..5b97b1c767 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S @@ -0,0 +1,57 @@ +/* Multiple versions of strchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+#if IS_IN (libc)
+	.text
+ENTRY(strchr)
+	.type strchr, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strchr_ia32)
+	HAS_CPU_FEATURE (SSE2)
+	jz 2f
+	LOAD_FUNC_GOT_EAX (__strchr_sse2_bsf)
+	HAS_ARCH_FEATURE (Slow_BSF)
+	jz 2f
+	LOAD_FUNC_GOT_EAX (__strchr_sse2)
2:	ret
+END(strchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strchr_ia32, @function; \
+	.globl __strchr_ia32; \
+	.p2align 4; \
+	__strchr_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strchr_ia32, .-__strchr_ia32
+# undef libc_hidden_builtin_def
+/* IFUNC doesn't work with hidden functions in a shared library: they
+   would be called without setting up EBX, which the PLT used by IFUNC
+   requires. */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strchr; __GI_strchr = __strchr_ia32
+#endif
+
+#include "../../i586/strchr.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
new file mode 100644
index 0000000000..cd26058671
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S
@@ -0,0 +1,804 @@
+/* strcmp with SSE4.2
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define CFI_PUSH(REG) \
+	cfi_adjust_cfa_offset (4); \
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+	cfi_adjust_cfa_offset (-4); \
+	cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#ifdef USE_AS_STRNCMP
+# ifndef STRCMP
+# define STRCMP __strncmp_sse4_2
+# endif
+# define STR1 8
+# define STR2 STR1+4
+# define CNT STR2+4
+# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM)
+# define REM %ebp
+#elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# ifndef STRCMP
+# define STRCMP __strcasecmp_l_sse4_2
+# endif
+# ifdef PIC
+# define STR1 12
+# else
+# define STR1 8
+# endif
+# define STR2 STR1+4
+# define LOCALE 12 /* Loaded before the adjustment.
*/ +# ifdef PIC +# define RETURN POP (%edi); POP (%ebx); ret; \ + .p2align 4; CFI_PUSH (%ebx); CFI_PUSH (%edi) +# else +# define RETURN POP (%edi); ret; .p2align 4; CFI_PUSH (%edi) +# endif +# define NONASCII __strcasecmp_nonascii +#elif defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" +# ifndef STRCMP +# define STRCMP __strncasecmp_l_sse4_2 +# endif +# ifdef PIC +# define STR1 16 +# else +# define STR1 12 +# endif +# define STR2 STR1+4 +# define CNT STR2+4 +# define LOCALE 16 /* Loaded before the adjustment. */ +# ifdef PIC +# define RETURN POP (%edi); POP (REM); POP (%ebx); ret; \ + .p2align 4; \ + CFI_PUSH (%ebx); CFI_PUSH (REM); CFI_PUSH (%edi) +# else +# define RETURN POP (%edi); POP (REM); ret; \ + .p2align 4; CFI_PUSH (REM); CFI_PUSH (%edi) +# endif +# define REM %ebp +# define NONASCII __strncasecmp_nonascii +#else +# ifndef STRCMP +# define STRCMP __strcmp_sse4_2 +# endif +# define STR1 4 +# define STR2 STR1+4 +# define RETURN ret; .p2align 4 +#endif + + .section .text.sse4.2,"ax",@progbits + +#ifdef USE_AS_STRCASECMP_L +ENTRY (__strcasecmp_sse4_2) +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) + movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax +# ifdef NO_TLS_DIRECT_SEG_REFS + addl %gs:0, %eax + movl (%eax), %eax +# else + movl %gs:(%eax), %eax +# endif +# else +# ifdef NO_TLS_DIRECT_SEG_REFS + movl %gs:0, %eax + movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax +# else + movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax +# endif +# endif +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) +# ifdef PIC + je L(ascii) + POP (%ebx) + jmp __strcasecmp_nonascii +# else + jne __strcasecmp_nonascii + jmp L(ascii) +# endif +END (__strcasecmp_sse4_2) +#endif + +#ifdef USE_AS_STRNCASECMP_L +ENTRY (__strncasecmp_sse4_2) +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) + movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax +# ifdef NO_TLS_DIRECT_SEG_REFS + addl %gs:0, %eax + movl (%eax), %eax +# else + movl %gs:(%eax), %eax +# endif +# else +# ifdef NO_TLS_DIRECT_SEG_REFS + movl %gs:0, %eax + movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax +# else + movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax +# endif +# endif +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) +# ifdef PIC + je L(ascii) + POP (%ebx) + jmp __strncasecmp_nonascii +# else + jne __strncasecmp_nonascii + jmp L(ascii) +# endif +END (__strncasecmp_sse4_2) +#endif + + ENTRY (STRCMP) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movl LOCALE(%esp), %eax +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) + jne NONASCII + +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) +# endif +L(ascii): + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +.Lbelowupper: + .quad 0x4040404040404040 + .quad 0x4040404040404040 +.Ltopupper: + .quad 0x5b5b5b5b5b5b5b5b + .quad 0x5b5b5b5b5b5b5b5b +.Ltouppermask: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + +# ifdef PIC +# define UCLOW_reg .Lbelowupper@GOTOFF(%ebx) +# define UCHIGH_reg .Ltopupper@GOTOFF(%ebx) +# define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx) +# else +# define UCLOW_reg .Lbelowupper +# define UCHIGH_reg .Ltopupper +# define 
LCQWORD_reg .Ltouppermask +# endif +#endif + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + PUSH (REM) +#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + PUSH (%edi) +#endif + mov STR1(%esp), %edx + mov STR2(%esp), %eax +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + movl CNT(%esp), REM + test REM, REM + je L(eq) +#endif + mov %dx, %cx + and $0xfff, %cx + cmp $0xff0, %cx + ja L(first4bytes) + movdqu (%edx), %xmm2 + mov %eax, %ecx + and $0xfff, %ecx + cmp $0xff0, %ecx + ja L(first4bytes) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define TOLOWER(reg1, reg2) \ + movdqa reg1, %xmm3; \ + movdqa UCHIGH_reg, %xmm4; \ + movdqa reg2, %xmm5; \ + movdqa UCHIGH_reg, %xmm6; \ + pcmpgtb UCLOW_reg, %xmm3; \ + pcmpgtb reg1, %xmm4; \ + pcmpgtb UCLOW_reg, %xmm5; \ + pcmpgtb reg2, %xmm6; \ + pand %xmm4, %xmm3; \ + pand %xmm6, %xmm5; \ + pand LCQWORD_reg, %xmm3; \ + pand LCQWORD_reg, %xmm5; \ + por %xmm3, reg1; \ + por %xmm5, reg2 + + movdqu (%eax), %xmm1 + TOLOWER (%xmm2, %xmm1) + movd %xmm2, %ecx + movd %xmm1, %edi + movdqa %xmm2, %xmm3 + movdqa %xmm1, %xmm4 + cmpl %edi, %ecx +#else +# define TOLOWER(reg1, reg) + + movd %xmm2, %ecx + cmp (%eax), %ecx +#endif + jne L(less4bytes) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + movdqu (%eax), %xmm1 +#endif + pxor %xmm2, %xmm1 + pxor %xmm0, %xmm0 + ptest %xmm1, %xmm0 + jnc L(less16bytes) + pcmpeqb %xmm0, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, REM + jbe L(eq) +#endif + add $16, %edx + add $16, %eax +L(first4bytes): + movzbl (%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl (%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, (%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $1, REM + je L(eq) +#endif + + movzbl 1(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 1(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 1(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $2, REM + je L(eq) +#endif + movzbl 2(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 2(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 2(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, REM + je L(eq) +#endif + movzbl 3(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 3(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), 
%edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 3(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + je L(eq) +#endif + movzbl 4(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 4(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 4(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, REM + je L(eq) +#endif + movzbl 5(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 5(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 5(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, REM + je L(eq) +#endif + movzbl 6(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 6(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 6(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + je L(eq) +#endif + movzbl 7(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 7(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 7(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $8, REM + je L(eq) +#endif + add $8, %eax + add $8, %edx + +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + PUSH (%edi) +#endif + PUSH (%esi) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cfi_remember_state +#endif + mov %edx, %edi + mov %eax, %esi + xorl %eax, %eax +L(check_offset): + movl %edi, %edx + movl %esi, %ecx + andl $0xfff, %edx + andl $0xfff, %ecx + cmpl %edx, %ecx + cmovl %edx, %ecx + lea -0xff0(%ecx), %edx + sub %edx, %edi + sub %edx, %esi + testl %edx, %edx + jg L(crosspage) +L(loop): + movdqu (%esi,%edx), %xmm2 + movdqu (%edi,%edx), %xmm1 + TOLOWER (%xmm2, %xmm1) + pcmpistri $0x1a, %xmm2, %xmm1 + jbe L(end) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, REM + jbe L(more16byteseq) +#endif + + add $16, %edx + jle L(loop) +L(crosspage): + movzbl (%edi,%edx), %eax + movzbl (%esi,%edx), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax + movl 
_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx +# endif +#endif + subl %ecx, %eax + jne L(ret) + testl %ecx, %ecx + je L(ret) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $1, REM + jbe L(more16byteseq) +#endif + inc %edx + cmp $15, %edx + jle L(crosspage) + add %edx, %edi + add %edx, %esi + jmp L(check_offset) + + .p2align 4 +L(end): + jnc L(ret) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %ecx, REM + jbe L(more16byteseq) +#endif + lea (%ecx,%edx), %ecx + movzbl (%edi,%ecx), %eax + movzbl (%esi,%ecx), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx +# endif +#endif + subl %ecx, %eax +L(ret): + POP (%esi) + POP (%edi) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + POP (REM) +#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + POP (%ebx) +# endif +#endif + ret + + .p2align 4 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cfi_restore_state +L(more16byteseq): + POP (%esi) +# ifdef USE_AS_STRNCMP + POP (%edi) +# endif +#endif +L(eq): + xorl %eax, %eax + RETURN + +L(neq): + mov $1, %eax + ja L(neq_bigger) + neg %eax +L(neq_bigger): + RETURN + +L(less16bytes): + add $0xfefefeff, %ecx + jnc L(less4bytes) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movd %xmm3, %edi + xor %edi, %ecx +#else + xor (%edx), %ecx +#endif + or $0xfefefeff, %ecx + add $1, %ecx + jnz L(less4bytes) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + jbe L(eq) +#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + psrldq $4, %xmm3 + psrldq $4, %xmm4 + movd %xmm3, %ecx + movd %xmm4, %edi + cmp %edi, %ecx + mov %ecx, %edi +#else + mov 4(%edx), %ecx + cmp 4(%eax), %ecx +#endif + jne L(more4bytes) + add $0xfefefeff, %ecx + jnc L(more4bytes) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + xor %edi, %ecx +#else + xor 4(%edx), %ecx +#endif + or $0xfefefeff, %ecx + add $1, %ecx + jnz L(more4bytes) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $8, REM + jbe L(eq) +#endif + + add $8, %edx + add $8, %eax +L(less4bytes): + + movzbl (%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl (%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, (%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $1, REM + je L(eq) +#endif + movzbl 1(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 1(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 1(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP 
|| defined USE_AS_STRNCASECMP_L + cmp $2, REM + je L(eq) +#endif + + movzbl 2(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 2(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 2(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, REM + je L(eq) +#endif + movzbl 3(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 3(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 3(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +L(more4bytes): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + je L(eq) +#endif + movzbl 4(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 4(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 4(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, REM + je L(eq) +#endif + movzbl 5(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 5(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 5(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, REM + je L(eq) +#endif + movzbl 6(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 6(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 6(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + je L(eq) +#endif + movzbl 7(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 7(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 7(%edx) +#endif + jne L(neq) + jmp L(eq) + +END (STRCMP) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S new file mode 100644 index 0000000000..b25cc3e068 --- /dev/null +++ 
b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S @@ -0,0 +1,2810 @@ +/* strcmp with SSSE3 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_STRNCMP +# ifndef STRCMP +# define STRCMP __strncmp_ssse3 +# endif +# define STR1 8 +# define STR2 STR1+4 +# define CNT STR2+4 +# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM) +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + mov $16, %esi; \ + sub %ecx, %esi; \ + cmp %esi, REM; \ + jbe L(more8byteseq); \ + sub %esi, REM +# define FLAGS %ebx +# define REM %ebp +#elif defined USE_AS_STRCASECMP_L +# include "locale-defines.h" +# ifndef STRCMP +# define STRCMP __strcasecmp_l_ssse3 +# endif +# ifdef PIC +# define STR1 8 +# else +# define STR1 4 +# endif +# define STR2 STR1+4 +# define LOCALE 12 /* Loaded before the adjustment. */ +# ifdef PIC +# define RETURN POP (%ebx); ret; .p2align 4; CFI_PUSH (%ebx) +# else +# define RETURN ret; .p2align 4 +# endif +# define UPDATE_STRNCMP_COUNTER +# define FLAGS (%esp) +# define NONASCII __strcasecmp_nonascii +#elif defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" +# ifndef STRCMP +# define STRCMP __strncasecmp_l_ssse3 +# endif +# ifdef PIC +# define STR1 12 +# else +# define STR1 8 +# endif +# define STR2 STR1+4 +# define CNT STR2+4 +# define LOCALE 16 /* Loaded before the adjustment. 
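+   (That is, the offset of the locale argument at function entry,
+   before the REM/%ebx pushes below move %esp.)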
*/ +# ifdef PIC +# define RETURN POP (REM); POP (%ebx); ret; \ + .p2align 4; CFI_PUSH (%ebx); CFI_PUSH (REM) +# else +# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM) +# endif +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + mov $16, %esi; \ + sub %ecx, %esi; \ + cmp %esi, REM; \ + jbe L(more8byteseq); \ + sub %esi, REM +# define FLAGS (%esp) +# define REM %ebp +# define NONASCII __strncasecmp_nonascii +#else +# ifndef STRCMP +# define STRCMP __strcmp_ssse3 +# endif +# define STR1 4 +# define STR2 STR1+4 +# define RETURN ret; .p2align 4 +# define UPDATE_STRNCMP_COUNTER +# define FLAGS %ebx +#endif + + .section .text.ssse3,"ax",@progbits + +#ifdef USE_AS_STRCASECMP_L +ENTRY (__strcasecmp_ssse3) +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) + movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax +# ifdef NO_TLS_DIRECT_SEG_REFS + addl %gs:0, %eax + movl (%eax), %eax +# else + movl %gs:(%eax), %eax +# endif +# else +# ifdef NO_TLS_DIRECT_SEG_REFS + movl %gs:0, %eax + movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax +# else + movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax +# endif +# endif +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) +# ifdef PIC + je L(ascii) + POP (%ebx) + jmp __strcasecmp_nonascii +# else + jne __strcasecmp_nonascii + jmp L(ascii) +# endif +END (__strcasecmp_ssse3) +#endif + +#ifdef USE_AS_STRNCASECMP_L +ENTRY (__strncasecmp_ssse3) +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) + movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax +# ifdef NO_TLS_DIRECT_SEG_REFS + addl %gs:0, %eax + movl (%eax), %eax +# else + movl %gs:(%eax), %eax +# endif +# else +# ifdef NO_TLS_DIRECT_SEG_REFS + movl %gs:0, %eax + movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax +# else + movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax +# endif +# endif +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) +# ifdef PIC + je L(ascii) + POP (%ebx) + jmp __strncasecmp_nonascii +# else + jne __strncasecmp_nonascii + jmp L(ascii) +# endif +END (__strncasecmp_ssse3) +#endif + +ENTRY (STRCMP) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movl LOCALE(%esp), %eax +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) + jne NONASCII + +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) +# endif +L(ascii): + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +.Lbelowupper: + .quad 0x4040404040404040 + .quad 0x4040404040404040 +.Ltopupper: + .quad 0x5b5b5b5b5b5b5b5b + .quad 0x5b5b5b5b5b5b5b5b +.Ltouppermask: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + +# ifdef PIC +# define UCLOW_reg .Lbelowupper@GOTOFF(%ebx) +# define UCHIGH_reg .Ltopupper@GOTOFF(%ebx) +# define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx) +# else +# define UCLOW_reg .Lbelowupper +# define UCHIGH_reg .Ltopupper +# define LCQWORD_reg .Ltouppermask +# endif +#endif + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + PUSH (REM) +#endif + + movl STR1(%esp), %edx + movl STR2(%esp), %eax +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + movl CNT(%esp), REM + cmp $16, REM + jb L(less16bytes_sncmp) +#elif !defined USE_AS_STRCASECMP_L + movzbl 
(%eax), %ecx + cmpb %cl, (%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 1(%eax), %ecx + cmpb %cl, 1(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 2(%eax), %ecx + cmpb %cl, 2(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 3(%eax), %ecx + cmpb %cl, 3(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 4(%eax), %ecx + cmpb %cl, 4(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 5(%eax), %ecx + cmpb %cl, 5(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 6(%eax), %ecx + cmpb %cl, 6(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 7(%eax), %ecx + cmpb %cl, 7(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + add $8, %edx + add $8, %eax +#endif + movl %edx, %ecx + and $0xfff, %ecx + cmp $0xff0, %ecx + ja L(crosspage) + mov %eax, %ecx + and $0xfff, %ecx + cmp $0xff0, %ecx + ja L(crosspage) + pxor %xmm0, %xmm0 + movlpd (%eax), %xmm1 + movlpd (%edx), %xmm2 + movhpd 8(%eax), %xmm1 + movhpd 8(%edx), %xmm2 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define TOLOWER(reg1, reg2) \ + movdqa reg1, %xmm5; \ + movdqa reg2, %xmm7; \ + movdqa UCHIGH_reg, %xmm6; \ + pcmpgtb UCLOW_reg, %xmm5; \ + pcmpgtb UCLOW_reg, %xmm7; \ + pcmpgtb reg1, %xmm6; \ + pand %xmm6, %xmm5; \ + movdqa UCHIGH_reg, %xmm6; \ + pcmpgtb reg2, %xmm6; \ + pand %xmm6, %xmm7; \ + pand LCQWORD_reg, %xmm5; \ + por %xmm5, reg1; \ + pand LCQWORD_reg, %xmm7; \ + por %xmm7, reg2 + TOLOWER (%xmm1, %xmm2) +#else +# define TOLOWER(reg1, reg2) +#endif + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %ecx + sub $0xffff, %ecx + jnz L(less16bytes) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(eq) +#endif + add $16, %eax + add $16, %edx + +L(crosspage): + +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + PUSH (FLAGS) +#endif + PUSH (%edi) + PUSH (%esi) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + pushl $0 + cfi_adjust_cfa_offset (4) +#endif +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cfi_remember_state +#endif + + movl %edx, %edi + movl %eax, %ecx + and $0xf, %ecx + and $0xf, %edi + xor %ecx, %eax + xor %edi, %edx +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + xor FLAGS, FLAGS +#endif + cmp %edi, %ecx + je L(ashr_0) + ja L(bigger) + orl $0x20, FLAGS + xchg %edx, %eax + xchg %ecx, %edi +L(bigger): + lea 15(%edi), %edi + sub %ecx, %edi + cmp $8, %edi + jle L(ashr_less_8) + cmp $14, %edi + je L(ashr_15) + cmp $13, %edi + je L(ashr_14) + cmp $12, %edi + je L(ashr_13) + cmp $11, %edi + je L(ashr_12) + cmp $10, %edi + je L(ashr_11) + cmp $9, %edi + je L(ashr_10) +L(ashr_less_8): + je L(ashr_9) + cmp $7, %edi + je L(ashr_8) + cmp $6, %edi + je L(ashr_7) + cmp $5, %edi + je L(ashr_6) + cmp $4, %edi + je L(ashr_5) + cmp $3, %edi + je L(ashr_4) + cmp $2, %edi + je L(ashr_3) + cmp $1, %edi + je L(ashr_2) + cmp $0, %edi + je L(ashr_1) + +/* + * The following cases will be handled by ashr_0 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +L(ashr_0): + mov $0xffff, %esi + movdqa (%eax), %xmm1 + pxor %xmm0, %xmm0 + pcmpeqb %xmm1, %xmm0 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movdqa (%edx), %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm2, %xmm1 +#else + pcmpeqb (%edx), %xmm1 +#endif + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + mov %ecx, %edi + jne 
L(less32bytes) + UPDATE_STRNCMP_COUNTER + movl $0x10, FLAGS + mov $0x10, %ecx + pxor %xmm0, %xmm0 + .p2align 4 +L(loop_ashr_0): + movdqa (%eax, %ecx), %xmm1 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movdqa (%edx, %ecx), %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 +#else + pcmpeqb %xmm1, %xmm0 + pcmpeqb (%edx, %ecx), %xmm1 +#endif + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + jmp L(loop_ashr_0) + +/* + * The following cases will be handled by ashr_1 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +L(ashr_1): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $15, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -15(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $1, FLAGS + lea 1(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_1): + add $16, %edi + jg L(nibble_ashr_1) + +L(gobble_ashr_1): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $1, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_1) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $1, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_1) + + .p2align 4 +L(nibble_ashr_1): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfffe, %esi + jnz L(ashr_1_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $15, REM + jbe L(ashr_1_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_1) + + .p2align 4 +L(ashr_1_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $1, %xmm0 + psrldq $1, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_2 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +L(ashr_2): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -14(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $2, FLAGS + lea 2(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_2): + add $16, %edi + jg L(nibble_ashr_2) + +L(gobble_ashr_2): + movdqa (%eax, %ecx), %xmm1 + movdqa 
(%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $2, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_2) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $2, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_2) + + .p2align 4 +L(nibble_ashr_2): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfffc, %esi + jnz L(ashr_2_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $14, REM + jbe L(ashr_2_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_2) + + .p2align 4 +L(ashr_2_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $2, %xmm0 + psrldq $2, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_3 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +L(ashr_3): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -13(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $3, FLAGS + lea 3(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_3): + add $16, %edi + jg L(nibble_ashr_3) + +L(gobble_ashr_3): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $3, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_3) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $3, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_3) + + .p2align 4 +L(nibble_ashr_3): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfff8, %esi + jnz L(ashr_3_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $13, REM + jbe L(ashr_3_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_3) + + .p2align 4 +L(ashr_3_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $3, %xmm0 + psrldq $3, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_4 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(12~15) n 
-12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +L(ashr_4): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -12(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $4, FLAGS + lea 4(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_4): + add $16, %edi + jg L(nibble_ashr_4) + +L(gobble_ashr_4): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $4, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_4) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $4, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_4) + + .p2align 4 +L(nibble_ashr_4): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfff0, %esi + jnz L(ashr_4_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $12, REM + jbe L(ashr_4_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_4) + + .p2align 4 +L(ashr_4_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_5 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(11~15) n -11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +L(ashr_5): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -11(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $5, FLAGS + lea 5(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_5): + add $16, %edi + jg L(nibble_ashr_5) + +L(gobble_ashr_5): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $5, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_5) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $5, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea 
-16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_5) + + .p2align 4 +L(nibble_ashr_5): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xffe0, %esi + jnz L(ashr_5_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $11, REM + jbe L(ashr_5_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_5) + + .p2align 4 +L(ashr_5_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $5, %xmm0 + psrldq $5, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_6 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(10~15) n -10 5(15 +(n-10) - n) ashr_6 + */ + + .p2align 4 +L(ashr_6): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -10(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $6, FLAGS + lea 6(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_6): + add $16, %edi + jg L(nibble_ashr_6) + +L(gobble_ashr_6): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $6, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_6) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $6, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_6) + + .p2align 4 +L(nibble_ashr_6): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xffc0, %esi + jnz L(ashr_6_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $10, REM + jbe L(ashr_6_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_6) + + .p2align 4 +L(ashr_6_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $6, %xmm0 + psrldq $6, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_7 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n-9) - n) ashr_7 + */ + + .p2align 4 +L(ashr_7): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -9(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $7, FLAGS + lea 8(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_7): + add $16, %edi + jg L(nibble_ashr_7) + +L(gobble_ashr_7): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $7, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb 
%xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_7) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $7, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_7) + + .p2align 4 +L(nibble_ashr_7): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xff80, %esi + jnz L(ashr_7_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $9, REM + jbe L(ashr_7_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_7) + + .p2align 4 +L(ashr_7_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $7, %xmm0 + psrldq $7, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_8 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n-8) - n) ashr_8 + */ + .p2align 4 +L(ashr_8): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -8(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $8, FLAGS + lea 8(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_8): + add $16, %edi + jg L(nibble_ashr_8) + +L(gobble_ashr_8): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $8, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_8) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $8, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_8) + + .p2align 4 +L(nibble_ashr_8): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xff00, %esi + jnz L(ashr_8_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $8, REM + jbe L(ashr_8_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_8) + + .p2align 4 +L(ashr_8_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $8, %xmm0 + psrldq $8, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_9 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n-7) - n) ashr_9 + */ + .p2align 4 +L(ashr_9): + mov $0xffff, 
%esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -7(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $9, FLAGS + lea 9(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_9): + add $16, %edi + jg L(nibble_ashr_9) + +L(gobble_ashr_9): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $9, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_9) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $9, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_9) + + .p2align 4 +L(nibble_ashr_9): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfe00, %esi + jnz L(ashr_9_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + jbe L(ashr_9_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_9) + + .p2align 4 +L(ashr_9_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $9, %xmm0 + psrldq $9, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_10 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n-6) - n) ashr_10 + */ + .p2align 4 +L(ashr_10): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -6(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $10, FLAGS + lea 10(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_10): + add $16, %edi + jg L(nibble_ashr_10) + +L(gobble_ashr_10): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $10, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_10) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $10, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, 
%xmm3 + jmp L(loop_ashr_10) + + .p2align 4 +L(nibble_ashr_10): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfc00, %esi + jnz L(ashr_10_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, REM + jbe L(ashr_10_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_10) + + .p2align 4 +L(ashr_10_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $10, %xmm0 + psrldq $10, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_11 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n-5) - n) ashr_11 + */ + .p2align 4 +L(ashr_11): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -5(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $11, FLAGS + lea 11(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_11): + add $16, %edi + jg L(nibble_ashr_11) + +L(gobble_ashr_11): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $11, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_11) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $11, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_11) + + .p2align 4 +L(nibble_ashr_11): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xf800, %esi + jnz L(ashr_11_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, REM + jbe L(ashr_11_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_11) + + .p2align 4 +L(ashr_11_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $11, %xmm0 + psrldq $11, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_12 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n-4) - n) ashr_12 + */ + .p2align 4 +L(ashr_12): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -4(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $12, FLAGS + lea 12(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_12): + add $16, %edi + jg L(nibble_ashr_12) + +L(gobble_ashr_12): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $12, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, 
%xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_12) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $12, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_12) + + .p2align 4 +L(nibble_ashr_12): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xf000, %esi + jnz L(ashr_12_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + jbe L(ashr_12_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_12) + + .p2align 4 +L(ashr_12_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $12, %xmm0 + psrldq $12, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_13 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n-3) - n) ashr_13 + */ + .p2align 4 +L(ashr_13): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -3(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $13, FLAGS + lea 13(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_13): + add $16, %edi + jg L(nibble_ashr_13) + +L(gobble_ashr_13): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $13, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_13) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $13, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_13) + + .p2align 4 +L(nibble_ashr_13): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xe000, %esi + jnz L(ashr_13_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, REM + jbe L(ashr_13_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_13) + + .p2align 4 +L(ashr_13_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $13, %xmm0 + psrldq $13, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_14 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n-2) - n) ashr_14 + */ + .p2align 4 +L(ashr_14): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa 
(%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -2(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $14, FLAGS + lea 14(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_14): + add $16, %edi + jg L(nibble_ashr_14) + +L(gobble_ashr_14): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $14, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_14) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $14, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_14) + + .p2align 4 +L(nibble_ashr_14): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xc000, %esi + jnz L(ashr_14_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $2, REM + jbe L(ashr_14_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_14) + + .p2align 4 +L(ashr_14_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $14, %xmm0 + psrldq $14, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_15 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n-1) - n) ashr_15 + */ + + .p2align 4 +L(ashr_15): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $1, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -1(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $15, FLAGS + lea 15(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_15): + add $16, %edi + jg L(nibble_ashr_15) + +L(gobble_ashr_15): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $15, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_15) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $15, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_15) + + .p2align 4 
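+/* As in the other L(nibble_ashr_N) blocks, the code below keeps the
+   main loop from letting an aligned 16-byte load step onto a page that
+   has not been validated yet: the still-unread tail of the previous
+   %edx-side chunk (%xmm3) is rechecked for a terminating NUL first.
+   For ashr_15 only byte 15 of %xmm3 is unread, hence the $0x8000 mask.
+   A rough C model of the loop's trigger condition, assuming 4 KiB
+   pages (a sketch with a hypothetical helper, not code from this
+   file):
+
+       #include <stdint.h>
+       int need_nibble_check (uintptr_t src, int bytes_done)
+       {
+         int edi = (int) ((src + 15) & 0xfff) - 0x1000;
+         return edi + 16 + bytes_done > 0;
+       }
+*/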
+L(nibble_ashr_15): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0x8000, %esi + jnz L(ashr_15_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $1, REM + jbe L(ashr_15_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_15) + + .p2align 4 +L(ashr_15_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $15, %xmm0 + psrldq $15, %xmm3 + jmp L(aftertail) + + .p2align 4 +L(aftertail): + TOLOWER (%xmm1, %xmm3) + pcmpeqb %xmm3, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + not %esi +L(exit): + mov FLAGS, %edi + and $0x1f, %edi + lea -16(%edi, %ecx), %edi +L(less32bytes): + add %edi, %edx + add %ecx, %eax + testl $0x20, FLAGS + jz L(ret2) + xchg %eax, %edx + + .p2align 4 +L(ret2): + mov %esi, %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + addl $4, %esp + cfi_adjust_cfa_offset (-4) +#endif + POP (%esi) + POP (%edi) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + POP (FLAGS) +#endif +L(less16bytes): + test %cl, %cl + jz L(2next_8_bytes) + + test $0x01, %cl + jnz L(Byte0) + + test $0x02, %cl + jnz L(Byte1) + + test $0x04, %cl + jnz L(Byte2) + + test $0x08, %cl + jnz L(Byte3) + + test $0x10, %cl + jnz L(Byte4) + + test $0x20, %cl + jnz L(Byte5) + + test $0x40, %cl + jnz L(Byte6) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + jbe L(eq) +#endif + + movzx 7(%eax), %ecx + movzx 7(%edx), %eax +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte0): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $0, REM + jbe L(eq) +#endif + movzx (%eax), %ecx + movzx (%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte1): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $1, REM + jbe L(eq) +#endif + movzx 1(%eax), %ecx + movzx 1(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte2): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $2, REM + jbe L(eq) +#endif + movzx 2(%eax), %ecx + movzx 2(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte3): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, REM + jbe L(eq) +#endif + movzx 3(%eax), %ecx + movzx 3(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + 
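+/* Both branches below run one byte through the C-locale tolower table:
+   _nl_C_LC_CTYPE_tolower holds 32-bit entries and is indexed from -128,
+   so the 128*4 byte bias makes indices -128..255 (EOF plus all unsigned
+   char values) addressable; under PIC the table address is formed via
+   %ebx/@GOTOFF.  Roughly, in C (a sketch, entry type assumed):
+       lowered = ((const int32_t *) _nl_C_LC_CTYPE_tolower + 128)[c];  */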
movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte4): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + jbe L(eq) +#endif + movzx 4(%eax), %ecx + movzx 4(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte5): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, REM + jbe L(eq) +#endif + movzx 5(%eax), %ecx + movzx 5(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte6): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, REM + jbe L(eq) +#endif + movzx 6(%eax), %ecx + movzx 6(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(2next_8_bytes): + add $8, %eax + add $8, %edx +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $8, REM + lea -8(REM), REM + jbe L(eq) +#endif + + test $0x01, %ch + jnz L(Byte0) + + test $0x02, %ch + jnz L(Byte1) + + test $0x04, %ch + jnz L(Byte2) + + test $0x08, %ch + jnz L(Byte3) + + test $0x10, %ch + jnz L(Byte4) + + test $0x20, %ch + jnz L(Byte5) + + test $0x40, %ch + jnz L(Byte6) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + jbe L(eq) +#endif + movzx 7(%eax), %ecx + movzx 7(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +#ifdef USE_AS_STRNCMP +L(neq_sncmp): +#endif +L(neq): + mov $1, %eax + ja L(neq_bigger) + neg %eax +L(neq_bigger): +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + addl $4, %esp + cfi_adjust_cfa_offset (-4) +#endif +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + POP (REM) +#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + POP (%ebx) +# endif +#endif + ret + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + .p2align 4 + cfi_restore_state +L(more8byteseq): + +# ifdef USE_AS_STRNCASECMP_L + addl $4, %esp + cfi_adjust_cfa_offset (-4) +# endif + POP (%esi) + POP (%edi) +# ifdef USE_AS_STRNCMP + POP (FLAGS) +# endif +#endif + +#ifdef USE_AS_STRNCMP +L(eq_sncmp): +#endif +L(eq): + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + POP (REM) 
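+	/* REM (%ebp) was pushed at entry by the counted variants only;
+	   the PIC case-insensitive builds still restore %ebx just below
+	   before the shared "return 0" tail.  */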
+#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + POP (%ebx) +# endif +#endif + xorl %eax, %eax + ret + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + .p2align 4 +# if defined USE_AS_STRNCASECMP_L && defined PIC + CFI_PUSH (%ebx) +# endif + CFI_PUSH (REM) +L(less16bytes_sncmp): +# ifdef USE_AS_STRNCASECMP_L + PUSH (%esi) +# endif + test REM, REM + jz L(eq_sncmp) + + movzbl (%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl (%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, (%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $1, REM + je L(eq_sncmp) + + movzbl 1(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 1(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 1(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $2, REM + je L(eq_sncmp) + + movzbl 2(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 2(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 2(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $3, REM + je L(eq_sncmp) + + movzbl 3(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 3(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 3(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $4, REM + je L(eq_sncmp) + + movzbl 4(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 4(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 4(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $5, REM + je L(eq_sncmp) + + movzbl 5(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 5(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 5(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $6, REM + je L(eq_sncmp) + + movzbl 6(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 6(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl 
_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 6(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $7, REM + je L(eq_sncmp) + + movzbl 7(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 7(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 7(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + + cmp $8, REM + je L(eq_sncmp) + + movzbl 8(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 8(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 8(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $9, REM + je L(eq_sncmp) + + movzbl 9(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 9(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 9(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $10, REM + je L(eq_sncmp) + + movzbl 10(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 10(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 10(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $11, REM + je L(eq_sncmp) + + movzbl 11(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 11(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 11(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + + cmp $12, REM + je L(eq_sncmp) + + movzbl 12(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 12(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 12(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $13, REM + je L(eq_sncmp) + + movzbl 13(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 13(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 13(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $14, REM + je L(eq_sncmp) + + movzbl 14(%eax), 
%ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 14(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 14(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $15, REM + je L(eq_sncmp) + + movzbl 15(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 15(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 15(%edx) +# endif + jne L(neq_sncmp) + +# ifdef USE_AS_STRNCASECMP_L +L(eq_sncmp): + POP (%esi) +# endif + POP (REM) +# if defined USE_AS_STRNCASECMP_L && defined PIC + POP (%ebx) +# endif + xor %eax, %eax + ret + +# ifdef USE_AS_STRNCASECMP_L + .p2align 4 +# ifdef PIC + CFI_PUSH (%ebx) +# endif + CFI_PUSH (REM) + CFI_PUSH (%esi) +L(neq_sncmp): + mov $1, %eax + mov $-1, %edx + cmovna %edx, %eax + POP (%esi) + POP (REM) +# ifdef PIC + POP (%ebx) +# endif + ret +# endif +#endif + +END (STRCMP) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S new file mode 100644 index 0000000000..56de25a4b7 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S @@ -0,0 +1,95 @@ +/* Multiple versions of strcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifdef USE_AS_STRNCMP +# define STRCMP strncmp +# define __GI_STRCMP __GI_strncmp +# define __STRCMP_IA32 __strncmp_ia32 +# define __STRCMP_SSSE3 __strncmp_ssse3 +# define __STRCMP_SSE4_2 __strncmp_sse4_2 +#elif defined USE_AS_STRCASECMP_L +# define STRCMP __strcasecmp_l +# define __GI_STRCMP __GI_strcasecmp_l +# define __STRCMP_IA32 __strcasecmp_l_ia32 +# define __STRCMP_SSSE3 __strcasecmp_l_ssse3 +# define __STRCMP_SSE4_2 __strcasecmp_l_sse4_2 +#elif defined USE_AS_STRNCASECMP_L +# define STRCMP __strncasecmp_l +# define __GI_STRCMP __GI_strncasecmp_l +# define __STRCMP_IA32 __strncasecmp_l_ia32 +# define __STRCMP_SSSE3 __strncasecmp_l_ssse3 +# define __STRCMP_SSE4_2 __strncasecmp_l_sse4_2 +#else +# define STRCMP strcmp +# define __GI_STRCMP __GI_strcmp +# define __STRCMP_IA32 __strcmp_ia32 +# define __STRCMP_SSSE3 __strcmp_ssse3 +# define __STRCMP_SSE4_2 __strcmp_sse4_2 +#endif + +/* Define multiple versions only for the definition in libc. 
Don't + define multiple versions for strncmp in the static library, since we + need strncmp before initialization has happened. */ +#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc) + .text +ENTRY(STRCMP) + .type STRCMP, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__STRCMP_IA32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__STRCMP_SSSE3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + LOAD_FUNC_GOT_EAX (__STRCMP_SSE4_2) +2: ret +END(STRCMP) + +# undef ENTRY +# define ENTRY(name) \ + .type __STRCMP_IA32, @function; \ + .p2align 4; \ + .globl __STRCMP_IA32; \ + .hidden __STRCMP_IA32; \ + __STRCMP_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __STRCMP_IA32, .-__STRCMP_IA32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in a shared library, since + they will be called without setting up EBX, which is needed for the PLT + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCMP; __GI_STRCMP = __STRCMP_IA32 +# endif +#endif + +#if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L \ + && !defined USE_AS_STRNCASECMP_L +# include "../strcmp.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S new file mode 100644 index 0000000000..ed627a5f62 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S @@ -0,0 +1,2250 @@ +/* strcpy with SSE2 and unaligned load + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#if IS_IN (libc) + +# include <sysdep.h> + + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef STRCPY +# define STRCPY __strcpy_sse2 +# endif + +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +# ifdef USE_AS_STRNCPY +# define PARMS 16 +# define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi) +# define RETURN POP(%edi); POP(%esi); POP(%ebx); ret; \ + CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi); + +# ifdef SHARED +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into ECX and branch to it. TABLE is a + jump table with relative offsets. + INDEX is a register that contains the index into the jump table. + SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into ECX. */ \ + SETUP_PIC_REG(cx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ecx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ecx,INDEX,SCALE), %ecx; \ + /* We loaded the jump table and adjusted ECX. Go. */ \ + jmp *%ecx +# else +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register that contains the index into the + jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif
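The JMPTBL/BRANCH_TO_JMPTBL_ENTRY pair defined above drives the L(ExitTable), L(ExitStrncpyTable) and L(FillTable) dispatches later in this file. In the SHARED build each table slot stores (target - table) rather than an absolute address, so the read-only tables need no load-time relocations; the runtime target is recovered by adding the slot back to the table's own address. A rough C model of the relative-offset variant (dispatch and handler_fn are illustrative names, not glibc's):

    typedef void (*handler_fn) (void);

    /* Each slot holds the signed distance from the table to its
       handler; adding it back to the table's runtime address gives a
       position-independent dispatch, just as BRANCH_TO_JMPTBL_ENTRY
       does with SETUP_PIC_REG and the two addl instructions.  */
    static void
    dispatch (const int *table, unsigned int index)
    {
      handler_fn target = (handler_fn) ((const char *) table + table[index]);
      target ();
    }

The non-SHARED variant can keep absolute addresses in the table and branch with a single jmp *TABLE(,INDEX,SCALE).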
+ +.text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edi + mov STR2(%esp), %esi + movl LEN(%esp), %ebx + test %ebx, %ebx + jz L(ExitZero) + + mov %esi, %ecx +# ifndef USE_AS_STPCPY + mov %edi, %eax /* save result */ +# endif + and $15, %ecx + jz L(SourceStringAlignmentZero) + + and $-16, %esi + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%esi), %xmm1 + add %ecx, %ebx + pmovmskb %xmm1, %edx + shr %cl, %edx +# ifdef USE_AS_STPCPY + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# else + cmp $17, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $32, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# else + cmp $33, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%edi) + + sub %ecx, %edi + +/* If source address alignment != destination address alignment */ + .p2align 4 +L(Unalign16Both): + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $48, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm2) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm3) + + movaps 16(%esi, %ecx), %xmm4 + movdqu %xmm3, (%edi, %ecx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm4) + + movaps 16(%esi, %ecx), %xmm1 + movdqu %xmm4, (%edi, %ecx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm1) + + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm2) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm3) + + movdqu %xmm3, (%edi, %ecx) + mov %esi, %edx + lea 16(%esi, %ecx), %esi + and $-0x40, %esi + sub %esi, %edx + sub %edx, %edi + lea 128(%ebx, %edx), %ebx + +L(Unaligned64Loop): + movaps (%esi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%esi), %xmm5 + movaps 32(%esi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%esi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) + test %edx, %edx + jnz L(Unaligned64Leave) +L(Unaligned64Loop_start): + add $64, %edi + add $64,
%esi + movdqu %xmm4, -64(%edi) + movaps (%esi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%edi) + movaps 16(%esi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%esi), %xmm3 + movdqu %xmm6, -32(%edi) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%edi) + movaps 48(%esi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) + test %edx, %edx + jz L(Unaligned64Loop_start) +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %ecx, %ecx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %ecx, %edx + movdqu %xmm4, (%edi) + movdqu %xmm5, 16(%edi) + movdqu %xmm6, 32(%edi) +# ifdef USE_AS_STPCPY + lea 48(%edi, %edx), %eax +# endif + movdqu %xmm7, 48(%edi) + add $15, %ebx + sub %edx, %ebx + lea 49(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + +/* If source address alignment == destination address alignment */ + +L(SourceStringAlignmentZero): + pxor %xmm0, %xmm0 + movdqa (%esi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# else + cmp $17, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1) + + pcmpeqb 16(%esi), %xmm0 + movdqu %xmm1, (%edi) + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $32, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# else + cmp $33, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes1) + + jmp L(Unalign16Both) + +/*-----------------End of main part---------------------------*/ + +/* Case1 */ + .p2align 4 +L(CopyFrom1To16BytesTail): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %esi + add $16, %edi + sub $16, %ebx +L(CopyFrom1To16BytesTail1): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + sub %ecx, %ebx + bsf %edx, %edx + add %ecx, %esi + add $16, %edx + sub %ecx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %edx, %edx +# ifdef USE_AS_STPCPY + lea (%edi, %edx), %eax +# endif + movdqu %xmm4, (%edi) + add $63, %ebx + sub %edx, %ebx + lea 1(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %ecx, %edx + movdqu %xmm4, (%edi) +# ifdef USE_AS_STPCPY + lea 16(%edi, %edx), %eax +# endif + movdqu %xmm5, 16(%edi) + add $47, %ebx + sub %edx, %ebx + lea 17(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %edx, %edx + movdqu %xmm4, (%edi) + movdqu %xmm5, 16(%edi) +# ifdef USE_AS_STPCPY + lea 32(%edi, %edx), %eax +# endif + movdqu %xmm6, 32(%edi) + add $31, %ebx + sub %edx, %ebx + lea 33(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm6): + movdqu %xmm6, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm5): + movdqu %xmm5, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm4): + movdqu %xmm4, (%edi, %ecx) + jmp 
L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm3): + movdqu %xmm3, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm1): + movdqu %xmm1, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %ecx, %edi + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + add $16, %edx + sub %ecx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +L(CopyFrom1To16BytesTailCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %ecx, %edi + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To32BytesCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTailCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %edi + add $16, %esi + sub $16, %ebx +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(Exit0): +# ifdef USE_AS_STPCPY + mov %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit1): + movb %dh, (%edi) +# ifdef USE_AS_STPCPY + lea (%edi), %eax +# endif + sub $1, %ebx + lea 1(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit2): + movw (%esi), %dx + movw %dx, (%edi) +# ifdef USE_AS_STPCPY + lea 1(%edi), %eax +# endif + sub $2, %ebx + lea 2(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit3): + movw (%esi), %cx + movw %cx, (%edi) + movb %dh, 2(%edi) +# ifdef USE_AS_STPCPY + lea 2(%edi), %eax +# endif + sub $3, %ebx + lea 3(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit4): + movl (%esi), %edx + movl %edx, (%edi) +# ifdef USE_AS_STPCPY + lea 3(%edi), %eax +# endif + sub $4, %ebx + lea 4(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit5): + movl (%esi), %ecx + movb %dh, 4(%edi) + movl %ecx, (%edi) +# ifdef USE_AS_STPCPY + lea 4(%edi), %eax +# endif + sub $5, %ebx + lea 5(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%edi) + movw %dx, 4(%edi) +# ifdef USE_AS_STPCPY + lea 5(%edi), %eax +# endif + sub $6, %ebx + lea 6(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%edi) + movl %edx, 3(%edi) +# ifdef USE_AS_STPCPY + lea 6(%edi), %eax +# endif + sub $7, 
%ebx + lea 7(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 7(%edi), %eax +# endif + sub $8, %ebx + lea 8(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit9): + movlpd (%esi), %xmm0 + movb %dh, 8(%edi) + movlpd %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 8(%edi), %eax +# endif + sub $9, %ebx + lea 9(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 9(%edi), %eax +# endif + sub $10, %ebx + lea 10(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) +# ifdef USE_AS_STPCPY + lea 10(%edi), %eax +# endif + sub $11, %ebx + lea 11(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 11(%edi), %eax +# endif + sub $12, %ebx + lea 12(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 5(%edi) +# ifdef USE_AS_STPCPY + lea 12(%edi), %eax +# endif + sub $13, %ebx + lea 13(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 6(%edi) +# ifdef USE_AS_STPCPY + lea 13(%edi), %eax +# endif + sub $14, %ebx + lea 14(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 7(%edi) +# ifdef USE_AS_STPCPY + lea 14(%edi), %eax +# endif + sub $15, %ebx + lea 15(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 15(%edi), %eax +# endif + sub $16, %ebx + lea 16(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit17): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) + movb %dh, 16(%edi) +# ifdef USE_AS_STPCPY + lea 16(%edi), %eax +# endif + sub $17, %ebx + lea 17(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%edi) + movw %cx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 17(%edi), %eax +# endif + sub $18, %ebx + lea 18(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 15(%edi) +# ifdef USE_AS_STPCPY + lea 18(%edi), %eax +# endif + sub $19, %ebx + lea 19(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 19(%edi), %eax +# endif + sub $20, %ebx + lea 20(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) + movb %dh, 20(%edi) +# ifdef USE_AS_STPCPY + lea 20(%edi), %eax +# endif + sub $21, %ebx + lea 21(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 14(%edi) 
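Each L(ExitN) stub above copies exactly N bytes, N being where the terminating NUL was found, using the widest moves available; when N is not a neat sum of chunk sizes, two stores are overlapped rather than issuing extra narrow moves. For instance, L(Exit22) just above writes 16 bytes at offset 0 and 8 bytes at offset 14, so bytes 14-15 are simply written twice. A minimal C sketch of that overlap trick, with copy22 a hypothetical helper:

    #include <string.h>

    /* Copy exactly 22 bytes as L(Exit22) does: one 16-byte block plus
       one 8-byte block pulled back so its end lines up with byte 21,
       overlapping two bytes in the middle.  */
    static void
    copy22 (char *dst, const char *src)
    {
      memcpy (dst, src, 16);           /* movdqu (%esi), %xmm0; movdqu %xmm0, (%edi)  */
      memcpy (dst + 14, src + 14, 8);  /* movlpd 14(%esi), %xmm3; movlpd %xmm3, 14(%edi)  */
    }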
+# ifdef USE_AS_STPCPY + lea 21(%edi), %eax +# endif + sub $22, %ebx + lea 22(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 15(%edi) +# ifdef USE_AS_STPCPY + lea 22(%edi), %eax +# endif + sub $23, %ebx + lea 23(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 23(%edi), %eax +# endif + sub $24, %ebx + lea 24(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit25): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movb %dh, 24(%edi) +# ifdef USE_AS_STPCPY + lea 24(%edi), %eax +# endif + sub $25, %ebx + lea 25(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movw %cx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 25(%edi), %eax +# endif + sub $26, %ebx + lea 26(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 23(%edi) +# ifdef USE_AS_STPCPY + lea 26(%edi), %eax +# endif + sub $27, %ebx + lea 27(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 27(%edi), %eax +# endif + sub $28, %ebx + lea 28(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 13(%edi) +# ifdef USE_AS_STPCPY + lea 28(%edi), %eax +# endif + sub $29, %ebx + lea 29(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 14(%edi) +# ifdef USE_AS_STPCPY + lea 29(%edi), %eax +# endif + sub $30, %ebx + lea 30(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + + .p2align 4 +L(Exit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 15(%edi) +# ifdef USE_AS_STPCPY + lea 30(%edi), %eax +# endif + sub $31, %ebx + lea 31(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 31(%edi), %eax +# endif + sub $32, %ebx + lea 32(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(StrncpyExit1): + movb (%esi), %dl + movb %dl, (%edi) +# ifdef USE_AS_STPCPY + lea 1(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit2): + movw (%esi), %dx + movw %dx, (%edi) +# ifdef USE_AS_STPCPY + lea 2(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit3): + movw (%esi), %cx + movb 2(%esi), %dl + movw %cx, (%edi) + movb %dl, 2(%edi) +# ifdef USE_AS_STPCPY + lea 3(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit4): + movl (%esi), %edx + movl %edx, (%edi) +# ifdef USE_AS_STPCPY + lea 4(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit5): + movl (%esi), %ecx + movb 4(%esi), %dl + movl %ecx, (%edi) + movb %dl, 4(%edi) +# ifdef USE_AS_STPCPY + lea 
5(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%edi) + movw %dx, 4(%edi) +# ifdef USE_AS_STPCPY + lea 6(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%edi) + movl %edx, 3(%edi) +# ifdef USE_AS_STPCPY + lea 7(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 8(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit9): + movlpd (%esi), %xmm0 + movb 8(%esi), %dl + movlpd %xmm0, (%edi) + movb %dl, 8(%edi) +# ifdef USE_AS_STPCPY + lea 9(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 10(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) +# ifdef USE_AS_STPCPY + lea 11(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 12(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 5(%edi) +# ifdef USE_AS_STPCPY + lea 13(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 6(%edi) +# ifdef USE_AS_STPCPY + lea 14(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 7(%edi) +# ifdef USE_AS_STPCPY + lea 15(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 16(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit17): + movdqu (%esi), %xmm0 + movb 16(%esi), %cl + movdqu %xmm0, (%edi) + movb %cl, 16(%edi) +# ifdef USE_AS_STPCPY + lea 17(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%edi) + movw %cx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 18(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 15(%edi) +# ifdef USE_AS_STPCPY + lea 19(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 20(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movb 20(%esi), %dl + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) + movb %dl, 20(%edi) +# ifdef USE_AS_STPCPY + lea 21(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 14(%edi) +# ifdef USE_AS_STPCPY + lea 22(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 15(%edi) +# ifdef USE_AS_STPCPY + lea 23(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 24(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit25): + movdqu 
(%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movb 24(%esi), %cl + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movb %cl, 24(%edi) +# ifdef USE_AS_STPCPY + lea 25(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movw %cx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 26(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 23(%edi) +# ifdef USE_AS_STPCPY + lea 27(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 28(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 13(%edi) +# ifdef USE_AS_STPCPY + lea 29(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 14(%edi) +# ifdef USE_AS_STPCPY + lea 30(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 15(%edi) +# ifdef USE_AS_STPCPY + lea 31(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 32(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit33): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movb 32(%esi), %cl + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) + movb %cl, 32(%edi) + RETURN + + .p2align 4 +L(Fill0): + RETURN + + .p2align 4 +L(Fill1): + movb %dl, (%edi) + RETURN + + .p2align 4 +L(Fill2): + movw %dx, (%edi) + RETURN + + .p2align 4 +L(Fill3): + movl %edx, -1(%edi) + RETURN + + .p2align 4 +L(Fill4): + movl %edx, (%edi) + RETURN + + .p2align 4 +L(Fill5): + movl %edx, (%edi) + movb %dl, 4(%edi) + RETURN + + .p2align 4 +L(Fill6): + movl %edx, (%edi) + movw %dx, 4(%edi) + RETURN + + .p2align 4 +L(Fill7): + movlpd %xmm0, -1(%edi) + RETURN + + .p2align 4 +L(Fill8): + movlpd %xmm0, (%edi) + RETURN + + .p2align 4 +L(Fill9): + movlpd %xmm0, (%edi) + movb %dl, 8(%edi) + RETURN + + .p2align 4 +L(Fill10): + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) + RETURN + + .p2align 4 +L(Fill11): + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) + RETURN + + .p2align 4 +L(Fill12): + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) + RETURN + + .p2align 4 +L(Fill13): + movlpd %xmm0, (%edi) + movlpd %xmm0, 5(%edi) + RETURN + + .p2align 4 +L(Fill14): + movlpd %xmm0, (%edi) + movlpd %xmm0, 6(%edi) + RETURN + + .p2align 4 +L(Fill15): + movdqu %xmm0, -1(%edi) + RETURN + + .p2align 4 +L(Fill16): + movdqu %xmm0, (%edi) + RETURN + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm2): + movdqu %xmm2, (%edi, %ecx) + + .p2align 4 +L(CopyFrom1To16BytesXmmExit): + bsf %edx, %edx + add $15, %ebx + add %ecx, %edi +# ifdef USE_AS_STPCPY + lea (%edi, %edx), %eax +# endif + sub %edx, %ebx + lea 1(%edi, %edx), %edi + + .p2align 4 +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %edx, %edx + sub $16, %ebx + jbe L(StrncpyFillExit) + + movdqu %xmm0, (%edi) + add $16, %edi + + mov %edi, %esi + and $0xf, %esi + sub %esi, %edi + add %esi, %ebx + sub $64, %ebx + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + 
movdqa %xmm0, (%edi) + movdqa %xmm0, 16(%edi) + movdqa %xmm0, 32(%edi) + movdqa %xmm0, 48(%edi) + add $64, %edi + sub $64, %ebx + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %ebx + jl L(StrncpyFillLess32) + movdqa %xmm0, (%edi) + movdqa %xmm0, 16(%edi) + add $32, %edi + sub $16, %ebx + jl L(StrncpyFillExit) + movdqa %xmm0, (%edi) + add $16, %edi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + +L(StrncpyFillLess32): + add $16, %ebx + jl L(StrncpyFillExit) + movdqa %xmm0, (%edi) + add $16, %edi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + +L(StrncpyFillExit): + add $16, %ebx + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %edx, %edx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%ebx), %ecx + and $-16, %ecx + add $48, %ebx + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%edi) +# ifdef USE_AS_STPCPY + lea 64(%edi), %eax +# endif + RETURN + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %ecx, %ecx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $48, %ebx + jle L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm4) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm4, (%edi) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm5) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm5, 16(%edi) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm6) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm6, 32(%edi) + lea 16(%edi, %ecx), %edi + lea 16(%esi, %ecx), %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(ExitZero): + movl %edi, %eax + RETURN + +END (STRCPY) + + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) + +L(ExitStrncpyTable): + .int JMPTBL(L(Exit0), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable)) + .int 
JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) + + .p2align 4 +L(FillTable): + .int JMPTBL(L(Fill0), L(FillTable)) + .int JMPTBL(L(Fill1), L(FillTable)) + .int JMPTBL(L(Fill2), L(FillTable)) + .int JMPTBL(L(Fill3), L(FillTable)) + .int JMPTBL(L(Fill4), L(FillTable)) + .int JMPTBL(L(Fill5), L(FillTable)) + .int JMPTBL(L(Fill6), L(FillTable)) + .int JMPTBL(L(Fill7), L(FillTable)) + .int JMPTBL(L(Fill8), L(FillTable)) + .int JMPTBL(L(Fill9), L(FillTable)) + .int JMPTBL(L(Fill10), L(FillTable)) + .int JMPTBL(L(Fill11), L(FillTable)) + .int JMPTBL(L(Fill12), L(FillTable)) + .int JMPTBL(L(Fill13), L(FillTable)) + .int JMPTBL(L(Fill14), L(FillTable)) + .int JMPTBL(L(Fill15), L(FillTable)) + .int JMPTBL(L(Fill16), L(FillTable)) +# else +# define PARMS 4 +# define ENTRANCE +# define RETURN POP (%edi); ret; CFI_PUSH (%edi) +# define RETURN1 ret + + .text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edx + mov STR2(%esp), %ecx + + cmpb $0, (%ecx) + jz L(ExitTail1) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + cmpb $0, 3(%ecx) + jz L(ExitTail4) + cmpb $0, 4(%ecx) + jz L(ExitTail5) + cmpb $0, 5(%ecx) + jz L(ExitTail6) + cmpb $0, 6(%ecx) + jz L(ExitTail7) + cmpb $0, 7(%ecx) + jz L(ExitTail8) + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + cmpb $0, 11(%ecx) + jz L(ExitTail12) + cmpb $0, 12(%ecx) + jz L(ExitTail13) + cmpb $0, 13(%ecx) + jz L(ExitTail14) + cmpb $0, 14(%ecx) + jz L(ExitTail15) + cmpb $0, 15(%ecx) + jz L(ExitTail16) + + PUSH (%edi) + PUSH (%ebx) + + mov %edx, %edi + lea 16(%ecx), %ebx + and $-16, %ebx + pxor %xmm0, %xmm0 + movdqu (%ecx), %xmm1 + movdqu %xmm1, (%edx) + pcmpeqb (%ebx), %xmm0 + pmovmskb %xmm0, %eax + sub %ecx, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + mov %ecx, %eax + lea 16(%ecx), %ecx + and $-16, %ecx + sub %ecx, %eax + sub %eax, %edx + xor %ebx, %ebx + + .p2align 
4 + movdqa (%ecx), %xmm1 + movaps 16(%ecx), %xmm2 + movdqu %xmm1, (%edx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm3 + movdqu %xmm2, (%edx, %ebx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm4 + movdqu %xmm3, (%edx, %ebx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm1 + movdqu %xmm4, (%edx, %ebx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm2 + movdqu %xmm1, (%edx, %ebx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm3 + movdqu %xmm2, (%edx, %ebx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movdqu %xmm3, (%edx, %ebx) + mov %ecx, %eax + lea 16(%ecx, %ebx), %ecx + and $-0x40, %ecx + sub %ecx, %eax + sub %eax, %edx + +L(Aligned64Loop): + movaps (%ecx), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%ecx), %xmm5 + movaps 32(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%ecx), %xmm7 + pminub %xmm5, %xmm2 + add $64, %ecx + pminub %xmm7, %xmm3 + add $64, %edx + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(Aligned64Leave) +L(Aligned64Loop_start): + movdqu %xmm4, -64(%edx) + movaps (%ecx), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%edx) + movaps 16(%ecx), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%ecx), %xmm3 + movdqu %xmm6, -32(%edx) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%edx) + movaps 48(%ecx), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + add $64, %edx + add $64, %ecx + test %eax, %eax + jz L(Aligned64Loop_start) +L(Aligned64Leave): + sub $0xa0, %ebx + pxor %xmm0, %xmm0 + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %eax + movdqu %xmm4, -64(%edx) + test %eax, %eax + lea 16(%ebx), %ebx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %eax + movdqu %xmm5, -48(%edx) + test %eax, %eax + lea 16(%ebx), %ebx + jnz L(CopyFrom1To16Bytes) + + movdqu %xmm6, -32(%edx) + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%ebx), %ebx + +/*-----------------End of main part---------------------------*/ + + .p2align 4 +L(CopyFrom1To16Bytes): + add %ebx, %edx + add %ebx, %ecx + + POP (%ebx) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + /* Exit 8 */ + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 7(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + /* Exit 16 */ + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm0 + movlpd %xmm0, 8(%edx) +# ifdef USE_AS_STPCPY + lea 15(%edx), %eax +# else + movl %edi, %eax +# endif 
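The test/jnz ladders in L(CopyFrom1To16Bytes) and L(ExitHigh) above decode the pcmpeqb/pmovmskb result: bit i of the mask is set exactly when byte i of the 16-byte block is NUL, so probing the bits in ascending order locates the terminator and picks the matching Exit stub. The same encoding, sketched with SSE2 intrinsics and a hypothetical helper first_nul_in_block (p assumed 16-byte aligned):

    #include <emmintrin.h>

    /* Index of the first NUL byte in an aligned 16-byte block, or -1
       if there is none; mask is the value the assembly receives in
       %eax from pmovmskb.  */
    static int
    first_nul_in_block (const char *p)
    {
      __m128i blk  = _mm_load_si128 ((const __m128i *) p);
      int     mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (blk, _mm_setzero_si128 ()));
      return mask != 0 ? __builtin_ctz (mask) : -1;
    }

Other paths in this file extract the same index with bsf and jump through the tables instead of walking the bits one test at a time.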
+ RETURN + + .p2align 4 +L(Exit1): + movb (%ecx), %al + movb %al, (%edx) +# ifdef USE_AS_STPCPY + lea (%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit2): + movw (%ecx), %ax + movw %ax, (%edx) +# ifdef USE_AS_STPCPY + lea 1(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) +# ifdef USE_AS_STPCPY + lea 2(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) +# ifdef USE_AS_STPCPY + lea 3(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) +# ifdef USE_AS_STPCPY + lea 4(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 5(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) +# ifdef USE_AS_STPCPY + lea 6(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit9): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) +# ifdef USE_AS_STPCPY + lea 8(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit10): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 9(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit11): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) +# ifdef USE_AS_STPCPY + lea 10(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit12): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 11(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) +# ifdef USE_AS_STPCPY + lea 12(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) +# ifdef USE_AS_STPCPY + lea 13(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) +# ifdef USE_AS_STPCPY + lea 14(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + +CFI_POP (%edi) + + .p2align 4 +L(ExitTail1): + movb (%ecx), %al + movb %al, (%edx) + movl %edx, %eax + RETURN1 + + .p2align 4 +L(ExitTail2): + movw (%ecx), %ax + movw %ax, (%edx) +# ifdef USE_AS_STPCPY + lea 1(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) +# ifdef USE_AS_STPCPY + lea 2(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail4): + movl (%ecx), %eax + movl %eax, (%edx) +# ifdef USE_AS_STPCPY + lea 3(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) +# ifdef 
USE_AS_STPCPY + lea 4(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 5(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) +# ifdef USE_AS_STPCPY + lea 6(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail8): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 7(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail9): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) +# ifdef USE_AS_STPCPY + lea 8(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail10): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 9(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail11): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) +# ifdef USE_AS_STPCPY + lea 10(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail12): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 11(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) +# ifdef USE_AS_STPCPY + lea 12(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) +# ifdef USE_AS_STPCPY + lea 13(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) +# ifdef USE_AS_STPCPY + lea 14(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail16): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm0 + movlpd %xmm0, 8(%edx) +# ifdef USE_AS_STPCPY + lea 15(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + +END (STRCPY) +# endif + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S new file mode 100644 index 0000000000..effd85da94 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S @@ -0,0 +1,3901 @@ +/* strcpy with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef STRCPY +# define STRCPY __strcpy_ssse3 +# endif + +# ifdef USE_AS_STRNCPY +# define PARMS 8 +# define ENTRANCE PUSH (%ebx) +# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx); +# define RETURN1 POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi) +# else +# define PARMS 4 +# define ENTRANCE +# define RETURN ret +# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi) +# endif + +# ifdef USE_AS_STPCPY +# define SAVE_RESULT(n) lea n(%edx), %eax +# define SAVE_RESULT_TAIL(n) lea n(%edx), %eax +# else +# define SAVE_RESULT(n) movl %edi, %eax +# define SAVE_RESULT_TAIL(n) movl %edx, %eax +# endif + +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +/* In this code, the following instructions are used for copying: + movb - 1 byte + movw - 2 bytes + movl - 4 bytes + movlpd - 8 bytes + movaps - 16 bytes - requires 16-byte alignment + of source and destination addresses. +*/ + +.text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edx + mov STR2(%esp), %ecx +# ifdef USE_AS_STRNCPY + movl LEN(%esp), %ebx + cmp $8, %ebx + jbe L(StrncpyExit8Bytes) +# endif + cmpb $0, (%ecx) + jz L(ExitTail1) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + cmpb $0, 3(%ecx) + jz L(ExitTail4) + cmpb $0, 4(%ecx) + jz L(ExitTail5) + cmpb $0, 5(%ecx) + jz L(ExitTail6) + cmpb $0, 6(%ecx) + jz L(ExitTail7) + cmpb $0, 7(%ecx) + jz L(ExitTail8) +# ifdef USE_AS_STRNCPY + cmp $16, %ebx + jb L(StrncpyExit15Bytes) +# endif + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + cmpb $0, 11(%ecx) + jz L(ExitTail12) + cmpb $0, 12(%ecx) + jz L(ExitTail13) + cmpb $0, 13(%ecx) + jz L(ExitTail14) + cmpb $0, 14(%ecx) + jz L(ExitTail15) +# ifdef USE_AS_STRNCPY + cmp $16, %ebx + je L(ExitTail16) +# endif + cmpb $0, 15(%ecx) + jz L(ExitTail16) + + PUSH (%edi) + mov %edx, %edi +# endif + PUSH (%esi) +# ifdef USE_AS_STRNCPY + mov %ecx, %esi + sub $16, %ebx + and $0xf, %esi + +/* add the offset of ecx within its 16-byte block to ebx */ + + add %esi, %ebx +# endif + lea 16(%ecx), %esi + and $-16, %esi + pxor %xmm0, %xmm0 + movlpd (%ecx), %xmm1 + movlpd %xmm1, (%edx) + + pcmpeqb (%esi), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm1, 8(%edx) + + pmovmskb %xmm0, %eax + sub %ecx, %esi + +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + mov %edx, %eax + lea 16(%edx), %edx + and $-16, %edx + sub %edx, %eax + +# ifdef USE_AS_STRNCPY + add %eax, %esi + lea -1(%esi), %esi + and $1<<31, %esi + test %esi, %esi + jnz L(ContinueCopy) + lea 16(%ebx), %ebx + +L(ContinueCopy): +# endif + sub %eax, %ecx + mov %ecx, %eax + and $0xf, %eax + mov $0, %esi + +/* case: ecx and edx have the same 16-byte offset */ + + jz L(Align16Both) + + cmp $8, %eax + jae L(ShlHigh8) + cmp $1, %eax + je L(Shl1) + cmp $2, %eax + je L(Shl2) + cmp $3, %eax + je L(Shl3) + cmp $4, %eax + je L(Shl4) + cmp $5, %eax + je L(Shl5) + cmp $6, %eax + je L(Shl6) + jmp L(Shl7) + +L(ShlHigh8): + je L(Shl8) + cmp $9, %eax + je L(Shl9) + cmp $10, %eax + je L(Shl10) + cmp $11,
%eax + je L(Shl11) + cmp $12, %eax + je L(Shl12) + cmp $13, %eax + je L(Shl13) + cmp $14, %eax + je L(Shl14) + jmp L(Shl15) + +L(Align16Both): + movaps (%ecx), %xmm1 + movaps 16(%ecx), %xmm2 + movaps %xmm1, (%edx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm4 + movaps %xmm3, (%edx, %esi) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm1 + movaps %xmm4, (%edx, %esi) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm2 + movaps %xmm1, (%edx, %esi) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%edx, %esi) + mov %ecx, %eax + lea 16(%ecx, %esi), %ecx + and $-0x40, %ecx + sub %ecx, %eax + sub %eax, %edx +# ifdef USE_AS_STRNCPY + lea 112(%ebx, %eax), %ebx +# endif + mov $-0x40, %esi + +L(Aligned64Loop): + movaps (%ecx), %xmm2 + movaps 32(%ecx), %xmm3 + movaps %xmm2, %xmm4 + movaps 16(%ecx), %xmm5 + movaps %xmm3, %xmm6 + movaps 48(%ecx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + lea 64(%edx), %edx + pcmpeqb %xmm0, %xmm3 + lea 64(%ecx), %ecx + pmovmskb %xmm3, %eax +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeaveCase2OrCase3) +# endif + test %eax, %eax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%edx) + movaps %xmm5, -48(%edx) + movaps %xmm6, -32(%edx) + movaps %xmm7, -16(%edx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): +# ifdef USE_AS_STRNCPY + lea 48(%ebx), %ebx +# endif + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%ebx), %ebx +# endif + pmovmskb %xmm0, %eax + movaps %xmm4, -64(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%ebx), %ebx +# endif + pmovmskb %xmm0, %eax + movaps %xmm5, -48(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%edx) + pcmpeqb %xmm7, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%ebx), %ebx +# endif + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl1): + movaps -1(%ecx), %xmm1 + movaps 15(%ecx), %xmm2 +L(Shl1Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, 
%xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 31(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -15(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -1(%ecx), %xmm1 + +L(Shl1LoopStart): + movaps 15(%ecx), %xmm2 + movaps 31(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 47(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 63(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + test %eax, %eax + palignr $1, %xmm3, %xmm4 + jnz L(Shl1Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave1) +# endif + palignr $1, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $1, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl1LoopStart) + +L(Shl1LoopExit): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) + mov $15, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl2): + movaps -2(%ecx), %xmm1 + movaps 14(%ecx), %xmm2 +L(Shl2Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 30(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -14(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -2(%ecx), 
%xmm1 + +L(Shl2LoopStart): + movaps 14(%ecx), %xmm2 + movaps 30(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 46(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 62(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + test %eax, %eax + palignr $2, %xmm3, %xmm4 + jnz L(Shl2Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave2) +# endif + palignr $2, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $2, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl2LoopStart) + +L(Shl2LoopExit): + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 6(%edx) + mov $14, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl3): + movaps -3(%ecx), %xmm1 + movaps 13(%ecx), %xmm2 +L(Shl3Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 29(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -13(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -3(%ecx), %xmm1 + +L(Shl3LoopStart): + movaps 13(%ecx), %xmm2 + movaps 29(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 45(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 61(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + test %eax, %eax + palignr $3, %xmm3, %xmm4 + jnz L(Shl3Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave3) +# endif + palignr $3, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $3, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl3LoopStart) + +L(Shl3LoopExit): + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) + mov $13, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl4): + movaps -4(%ecx), %xmm1 + movaps 12(%ecx), %xmm2 +L(Shl4Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, 
%xmm1 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 28(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -12(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -4(%ecx), %xmm1 + +L(Shl4LoopStart): + movaps 12(%ecx), %xmm2 + movaps 28(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %eax, %eax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave4) +# endif + palignr $4, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) + mov $12, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl5): + movaps -5(%ecx), %xmm1 + movaps 11(%ecx), %xmm2 +L(Shl5Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 27(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -11(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -5(%ecx), %xmm1 + +L(Shl5LoopStart): + 
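+/* Main copy loop for the shift-by-5 case; Shl1..Shl15 are identical up
+   to the shift count.  Each iteration loads four aligned 16-byte source
+   chunks, folds them with pminub so a single pcmpeqb/pmovmskb tests all
+   64 bytes for the NUL terminator, and splices neighbouring chunks with
+   palignr $5 into 16-byte-aligned stores through %edx.  */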
movaps 11(%ecx), %xmm2 + movaps 27(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 43(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 59(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + test %eax, %eax + palignr $5, %xmm3, %xmm4 + jnz L(Shl5Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave5) +# endif + palignr $5, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $5, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl5LoopStart) + +L(Shl5LoopExit): + movlpd (%ecx), %xmm0 + movl 7(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 7(%edx) + mov $11, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl6): + movaps -6(%ecx), %xmm1 + movaps 10(%ecx), %xmm2 +L(Shl6Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 26(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -10(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -6(%ecx), %xmm1 + +L(Shl6LoopStart): + movaps 10(%ecx), %xmm2 + movaps 26(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 42(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 58(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + test %eax, %eax + palignr $6, %xmm3, %xmm4 + jnz L(Shl6Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave6) +# endif + palignr $6, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $6, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl6LoopStart) + +L(Shl6LoopExit): + movlpd (%ecx), %xmm0 + movl 6(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 6(%edx) + mov $10, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl7): + movaps -7(%ecx), %xmm1 + movaps 9(%ecx), %xmm2 +L(Shl7Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 
25(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 25(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -9(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -7(%ecx), %xmm1 + +L(Shl7LoopStart): + movaps 9(%ecx), %xmm2 + movaps 25(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 41(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 57(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + test %eax, %eax + palignr $7, %xmm3, %xmm4 + jnz L(Shl7Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave7) +# endif + palignr $7, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $7, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl7LoopStart) + +L(Shl7LoopExit): + movlpd (%ecx), %xmm0 + movl 5(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 5(%edx) + mov $9, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl8): + movaps -8(%ecx), %xmm1 + movaps 8(%ecx), %xmm2 +L(Shl8Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 24(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -8(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -8(%ecx), %xmm1 + +L(Shl8LoopStart): + movaps 8(%ecx), %xmm2 + movaps 24(%ecx), 
%xmm3 + movaps %xmm3, %xmm6 + movaps 40(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %eax, %eax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave8) +# endif + palignr $8, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + mov $8, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl9): + movaps -9(%ecx), %xmm1 + movaps 7(%ecx), %xmm2 +L(Shl9Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 23(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -7(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -9(%ecx), %xmm1 + +L(Shl9LoopStart): + movaps 7(%ecx), %xmm2 + movaps 23(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 39(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 55(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + test %eax, %eax + palignr $9, %xmm3, %xmm4 + jnz L(Shl9Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave9) +# endif + palignr $9, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $9, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl9LoopStart) + +L(Shl9LoopExit): + movlpd -1(%ecx), %xmm0 + movlpd %xmm0, -1(%edx) + mov $7, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl10): + movaps -10(%ecx), %xmm1 + movaps 6(%ecx), %xmm2 +L(Shl10Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 
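+/* strncpy only: leave the unrolled prologue once at most 16 bytes of
+   the length limit in %ebx remain; the Exit10 tail honours the exact
+   remaining count.  */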
+# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 22(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -6(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -10(%ecx), %xmm1 + +L(Shl10LoopStart): + movaps 6(%ecx), %xmm2 + movaps 22(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 38(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 54(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + test %eax, %eax + palignr $10, %xmm3, %xmm4 + jnz L(Shl10Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave10) +# endif + palignr $10, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $10, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl10LoopStart) + +L(Shl10LoopExit): + movlpd -2(%ecx), %xmm0 + movlpd %xmm0, -2(%edx) + mov $6, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl11): + movaps -11(%ecx), %xmm1 + movaps 5(%ecx), %xmm2 +L(Shl11Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 21(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -5(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -11(%ecx), %xmm1 + +L(Shl11LoopStart): + movaps 5(%ecx), %xmm2 + movaps 21(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 37(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 53(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub 
%xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + test %eax, %eax + palignr $11, %xmm3, %xmm4 + jnz L(Shl11Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave11) +# endif + palignr $11, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $11, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl11LoopStart) + +L(Shl11LoopExit): + movlpd -3(%ecx), %xmm0 + movlpd %xmm0, -3(%edx) + mov $5, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl12): + movaps -12(%ecx), %xmm1 + movaps 4(%ecx), %xmm2 +L(Shl12Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 20(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -4(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -12(%ecx), %xmm1 + +L(Shl12LoopStart): + movaps 4(%ecx), %xmm2 + movaps 20(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %eax, %eax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave12) +# endif + palignr $12, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + movl (%ecx), %esi + movl %esi, (%edx) + mov $4, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl13): + movaps -13(%ecx), %xmm1 + movaps 3(%ecx), %xmm2 +L(Shl13Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + 
jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 19(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -3(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -13(%ecx), %xmm1 + +L(Shl13LoopStart): + movaps 3(%ecx), %xmm2 + movaps 19(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 35(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 51(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + test %eax, %eax + palignr $13, %xmm3, %xmm4 + jnz L(Shl13Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave13) +# endif + palignr $13, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $13, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl13LoopStart) + +L(Shl13LoopExit): + movl -1(%ecx), %esi + movl %esi, -1(%edx) + mov $3, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl14): + movaps -14(%ecx), %xmm1 + movaps 2(%ecx), %xmm2 +L(Shl14Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 18(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -2(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -14(%ecx), %xmm1 + +L(Shl14LoopStart): + movaps 2(%ecx), %xmm2 + movaps 18(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 34(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 50(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + 
palignr $14, %xmm4, %xmm5 + test %eax, %eax + palignr $14, %xmm3, %xmm4 + jnz L(Shl14Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave14) +# endif + palignr $14, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $14, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl14LoopStart) + +L(Shl14LoopExit): + movl -2(%ecx), %esi + movl %esi, -2(%edx) + mov $2, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl15): + movaps -15(%ecx), %xmm1 + movaps 1(%ecx), %xmm2 +L(Shl15Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 17(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -1(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -15(%ecx), %xmm1 + +L(Shl15LoopStart): + movaps 1(%ecx), %xmm2 + movaps 17(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 33(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 49(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + test %eax, %eax + palignr $15, %xmm3, %xmm4 + jnz L(Shl15Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave15) +# endif + palignr $15, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $15, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl15LoopStart) + +L(Shl15LoopExit): + movl -3(%ecx), %esi + movl %esi, -3(%edx) + mov $1, %esi +# ifdef USE_AS_STRCAT + jmp L(CopyFrom1To16Bytes) +# endif + + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(CopyFrom1To16Bytes): +# ifdef USE_AS_STRNCPY + add $16, %ebx +# endif + add %esi, %edx + add %esi, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh8) + +L(CopyFrom1To16BytesLess8): + mov %al, %ah + and $15, %ah + jz L(ExitHigh4) + + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT (3) +# ifdef USE_AS_STRNCPY + sub $4, %ebx + lea 4(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh4): + 
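+/* The terminator is not in bytes 0-3 of the last chunk: mask bits 4-6
+   select the 5-, 6- or 7-byte exit, and bit 7 falls through to the
+   8-byte copy.  */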
test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + + .p2align 4 +L(Exit8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + SAVE_RESULT (7) +# ifdef USE_AS_STRNCPY + sub $8, %ebx + lea 8(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh8): + mov %ah, %al + and $15, %al + jz L(ExitHigh12) + + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + + .p2align 4 +L(Exit12): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT (11) +# ifdef USE_AS_STRNCPY + sub $12, %ebx + lea 12(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh12): + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + + .p2align 4 +L(Exit16): + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + SAVE_RESULT (15) +# ifdef USE_AS_STRNCPY + sub $16, %ebx + lea 16(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + +# ifdef USE_AS_STRNCPY + + CFI_PUSH(%esi) + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %esi, %ecx + add %esi, %edx + + POP (%esi) + + test %al, %al + jz L(ExitHighCase2) + + cmp $8, %ebx + ja L(CopyFrom1To16BytesLess8) + + test $0x01, %al + jnz L(Exit1) + cmp $1, %ebx + je L(Exit1) + test $0x02, %al + jnz L(Exit2) + cmp $2, %ebx + je L(Exit2) + test $0x04, %al + jnz L(Exit3) + cmp $3, %ebx + je L(Exit3) + test $0x08, %al + jnz L(Exit4) + cmp $4, %ebx + je L(Exit4) + test $0x10, %al + jnz L(Exit5) + cmp $5, %ebx + je L(Exit5) + test $0x20, %al + jnz L(Exit6) + cmp $6, %ebx + je L(Exit6) + test $0x40, %al + jnz L(Exit7) + cmp $7, %ebx + je L(Exit7) + jmp L(Exit8) + + .p2align 4 +L(ExitHighCase2): + cmp $8, %ebx + jbe L(CopyFrom1To16BytesLess8Case3) + + test $0x01, %ah + jnz L(Exit9) + cmp $9, %ebx + je L(Exit9) + test $0x02, %ah + jnz L(Exit10) + cmp $10, %ebx + je L(Exit10) + test $0x04, %ah + jnz L(Exit11) + cmp $11, %ebx + je L(Exit11) + test $0x8, %ah + jnz L(Exit12) + cmp $12, %ebx + je L(Exit12) + test $0x10, %ah + jnz L(Exit13) + cmp $13, %ebx + je L(Exit13) + test $0x20, %ah + jnz L(Exit14) + cmp $14, %ebx + je L(Exit14) + test $0x40, %ah + jnz L(Exit15) + cmp $15, %ebx + je L(Exit15) + jmp L(Exit16) + + CFI_PUSH(%esi) + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %esi, %edx + add %esi, %ecx + + POP (%esi) + + cmp $8, %ebx + ja L(ExitHigh8Case3) + +L(CopyFrom1To16BytesLess8Case3): + cmp $4, %ebx + ja L(ExitHigh4Case3) + + cmp $1, %ebx + je L(Exit1) + cmp $2, %ebx + je L(Exit2) + cmp $3, %ebx + je L(Exit3) + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT (4) + RETURN1 + + .p2align 4 +L(ExitHigh4Case3): + cmp $5, %ebx + je L(Exit5) + cmp $6, %ebx + je L(Exit6) + cmp $7, %ebx + je L(Exit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + SAVE_RESULT (8) + RETURN1 + + .p2align 4 +L(ExitHigh8Case3): + cmp $12, %ebx + ja L(ExitHigh12Case3) + + cmp $9, %ebx + je L(Exit9) + cmp $10, %ebx + je L(Exit10) + cmp $11, %ebx + je L(Exit11) + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT (12) + RETURN1 + + .p2align 4 +L(ExitHigh12Case3): + cmp $13, %ebx + je 
L(Exit13) + cmp $14, %ebx + je L(Exit14) + cmp $15, %ebx + je L(Exit15) + movlpd (%ecx), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) + SAVE_RESULT (16) + RETURN1 + +# endif + + .p2align 4 +L(Exit1): + movb (%ecx), %al + movb %al, (%edx) + SAVE_RESULT (0) +# ifdef USE_AS_STRNCPY + sub $1, %ebx + lea 1(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit2): + movw (%ecx), %ax + movw %ax, (%edx) + SAVE_RESULT (1) +# ifdef USE_AS_STRNCPY + sub $2, %ebx + lea 2(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) + SAVE_RESULT (2) +# ifdef USE_AS_STRNCPY + sub $3, %ebx + lea 3(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) + SAVE_RESULT (4) +# ifdef USE_AS_STRNCPY + sub $5, %ebx + lea 5(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) + SAVE_RESULT (5) +# ifdef USE_AS_STRNCPY + sub $6, %ebx + lea 6(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) + SAVE_RESULT (6) +# ifdef USE_AS_STRNCPY + sub $7, %ebx + lea 7(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit9): + movlpd (%ecx), %xmm0 + movb 8(%ecx), %al + movlpd %xmm0, (%edx) + movb %al, 8(%edx) + SAVE_RESULT (8) +# ifdef USE_AS_STRNCPY + sub $9, %ebx + lea 9(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit10): + movlpd (%ecx), %xmm0 + movw 8(%ecx), %ax + movlpd %xmm0, (%edx) + movw %ax, 8(%edx) + SAVE_RESULT (9) +# ifdef USE_AS_STRNCPY + sub $10, %ebx + lea 10(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit11): + movlpd (%ecx), %xmm0 + movl 7(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 7(%edx) + SAVE_RESULT (10) +# ifdef USE_AS_STRNCPY + sub $11, %ebx + lea 11(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit13): + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) + SAVE_RESULT (12) +# ifdef USE_AS_STRNCPY + sub $13, %ebx + lea 13(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit14): + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 6(%edx) + SAVE_RESULT (13) +# ifdef USE_AS_STRNCPY + sub $14, %ebx + lea 14(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit15): + movlpd (%ecx), 
%xmm0 + movlpd 7(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 7(%edx) + SAVE_RESULT (14) +# ifdef USE_AS_STRNCPY + sub $15, %ebx + lea 15(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + +CFI_POP (%edi) + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(Fill0): + RETURN + + .p2align 4 +L(Fill1): + movb %dl, (%ecx) + RETURN + + .p2align 4 +L(Fill2): + movw %dx, (%ecx) + RETURN + + .p2align 4 +L(Fill3): + movw %dx, (%ecx) + movb %dl, 2(%ecx) + RETURN + + .p2align 4 +L(Fill4): + movl %edx, (%ecx) + RETURN + + .p2align 4 +L(Fill5): + movl %edx, (%ecx) + movb %dl, 4(%ecx) + RETURN + + .p2align 4 +L(Fill6): + movl %edx, (%ecx) + movw %dx, 4(%ecx) + RETURN + + .p2align 4 +L(Fill7): + movl %edx, (%ecx) + movl %edx, 3(%ecx) + RETURN + + .p2align 4 +L(Fill8): + movlpd %xmm0, (%ecx) + RETURN + + .p2align 4 +L(Fill9): + movlpd %xmm0, (%ecx) + movb %dl, 8(%ecx) + RETURN + + .p2align 4 +L(Fill10): + movlpd %xmm0, (%ecx) + movw %dx, 8(%ecx) + RETURN + + .p2align 4 +L(Fill11): + movlpd %xmm0, (%ecx) + movl %edx, 7(%ecx) + RETURN + + .p2align 4 +L(Fill12): + movlpd %xmm0, (%ecx) + movl %edx, 8(%ecx) + RETURN + + .p2align 4 +L(Fill13): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 5(%ecx) + RETURN + + .p2align 4 +L(Fill14): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 6(%ecx) + RETURN + + .p2align 4 +L(Fill15): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 7(%ecx) + RETURN + + .p2align 4 +L(Fill16): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 8(%ecx) + RETURN + + .p2align 4 +L(StrncpyFillExit1): + lea 16(%ebx), %ebx +L(FillFrom1To16Bytes): + test %ebx, %ebx + jz L(Fill0) + cmp $16, %ebx + je L(Fill16) + cmp $8, %ebx + je L(Fill8) + jg L(FillMore8) + cmp $4, %ebx + je L(Fill4) + jg L(FillMore4) + cmp $2, %ebx + jl L(Fill1) + je L(Fill2) + jg L(Fill3) +L(FillMore8): /* but less than 16 */ + cmp $12, %ebx + je L(Fill12) + jl L(FillLess12) + cmp $14, %ebx + jl L(Fill13) + je L(Fill14) + jg L(Fill15) +L(FillMore4): /* but less than 8 */ + cmp $6, %ebx + jl L(Fill5) + je L(Fill6) + jg L(Fill7) +L(FillLess12): /* but more than 8 */ + cmp $10, %ebx + jl L(Fill9) + je L(Fill10) + jmp L(Fill11) + + CFI_PUSH(%edi) + + .p2align 4 +L(StrncpyFillTailWithZero1): + POP (%edi) +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %edx, %edx + sub $16, %ebx + jbe L(StrncpyFillExit1) + + movlpd %xmm0, (%ecx) + movlpd %xmm0, 8(%ecx) + + lea 16(%ecx), %ecx + + mov %ecx, %edx + and $0xf, %edx + sub %edx, %ecx + add %edx, %ebx + xor %edx, %edx + sub $64, %ebx + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%ecx) + movdqa %xmm0, 16(%ecx) + movdqa %xmm0, 32(%ecx) + movdqa %xmm0, 48(%ecx) + lea 64(%ecx), %ecx + sub $64, %ebx + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %ebx + jl L(StrncpyFillLess32) + movdqa %xmm0, (%ecx) + movdqa %xmm0, 16(%ecx) + lea 32(%ecx), %ecx + sub $16, %ebx + jl L(StrncpyFillExit1) + movdqa %xmm0, (%ecx) + lea 16(%ecx), %ecx + jmp L(FillFrom1To16Bytes) + +L(StrncpyFillLess32): + add $16, %ebx + jl L(StrncpyFillExit1) + movdqa %xmm0, (%ecx) + lea 16(%ecx), %ecx + jmp L(FillFrom1To16Bytes) +# endif + + .p2align 4 +L(ExitTail1): + movb (%ecx), %al + movb %al, (%edx) + SAVE_RESULT_TAIL (0) +# ifdef USE_AS_STRNCPY + sub $1, %ebx + lea 1(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail2): + movw (%ecx), %ax + movw %ax, (%edx) + SAVE_RESULT_TAIL (1) +# ifdef USE_AS_STRNCPY + sub $2, %ebx + lea 2(%edx), %ecx 
+ jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) + SAVE_RESULT_TAIL (2) +# ifdef USE_AS_STRNCPY + sub $3, %ebx + lea 3(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail4): + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT_TAIL (3) +# ifdef USE_AS_STRNCPY + sub $4, %ebx + lea 4(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) + SAVE_RESULT_TAIL (4) +# ifdef USE_AS_STRNCPY + sub $5, %ebx + lea 5(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) + SAVE_RESULT_TAIL (5) +# ifdef USE_AS_STRNCPY + sub $6, %ebx + lea 6(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) + SAVE_RESULT_TAIL (6) +# ifdef USE_AS_STRNCPY + sub $7, %ebx + lea 7(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + SAVE_RESULT_TAIL (7) +# ifdef USE_AS_STRNCPY + sub $8, %ebx + lea 8(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# endif + RETURN + + .p2align 4 +L(ExitTail9): + movlpd (%ecx), %xmm0 + movb 8(%ecx), %al + movlpd %xmm0, (%edx) + movb %al, 8(%edx) + SAVE_RESULT_TAIL (8) +# ifdef USE_AS_STRNCPY + sub $9, %ebx + lea 9(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail10): + movlpd (%ecx), %xmm0 + movw 8(%ecx), %ax + movlpd %xmm0, (%edx) + movw %ax, 8(%edx) + SAVE_RESULT_TAIL (9) +# ifdef USE_AS_STRNCPY + sub $10, %ebx + lea 10(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail11): + movlpd (%ecx), %xmm0 + movl 7(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 7(%edx) + SAVE_RESULT_TAIL (10) +# ifdef USE_AS_STRNCPY + sub $11, %ebx + lea 11(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail12): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT_TAIL (11) +# ifdef USE_AS_STRNCPY + sub $12, %ebx + lea 12(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail13): + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) + SAVE_RESULT_TAIL (12) +# ifdef USE_AS_STRNCPY + sub $13, %ebx + lea 13(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail14): + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 
6(%edx) + SAVE_RESULT_TAIL (13) +# ifdef USE_AS_STRNCPY + sub $14, %ebx + lea 14(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail15): + movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 7(%edx) + SAVE_RESULT_TAIL (14) +# ifdef USE_AS_STRNCPY + sub $15, %ebx + lea 15(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# endif + RETURN + + .p2align 4 +L(ExitTail16): + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + SAVE_RESULT_TAIL (15) +# ifdef USE_AS_STRNCPY + sub $16, %ebx + lea 16(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN +# endif + +# ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + CFI_PUSH (%esi) + CFI_PUSH (%edi) +# endif + .p2align 4 +L(StrncpyLeaveCase2OrCase3): + test %eax, %eax + jnz L(Aligned64LeaveCase2) + +L(Aligned64LeaveCase3): + add $48, %ebx + jle L(CopyFrom1To16BytesCase3) + movaps %xmm4, -64(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm5, -48(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm6, -32(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + jmp L(CopyFrom1To16BytesCase3) + +L(Aligned64LeaveCase2): + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + add $48, %ebx + jle L(CopyFrom1To16BytesCase2OrCase3) + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm4, -64(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm5, -48(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm6, -32(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + jmp L(CopyFrom1To16BytesCase2) + +/*--------------------------------------------------*/ + .p2align 4 +L(StrncpyExit1Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 7(%edx) + mov $15, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit2Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 6(%edx) + mov $14, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit3Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) + mov $13, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit4Case2OrCase3): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) + mov $12, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit5Case2OrCase3): + movlpd (%ecx), %xmm0 + movl 7(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 7(%edx) + mov $11, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit6Case2OrCase3): + movlpd (%ecx), %xmm0 + movl 6(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 6(%edx) + mov $10, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit7Case2OrCase3): + 
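+/* Length limit reached in the Shl7 prologue: move the 9 pending bytes
+   with an 8-byte load plus an overlapping 4-byte load, record offset 9
+   in %esi, then finish via Case2 (NUL already seen) or Case3 (limit
+   reached first).  */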
movlpd (%ecx), %xmm0 + movl 5(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 5(%edx) + mov $9, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit8Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + mov $8, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit9Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + mov $7, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit10Case2OrCase3): + movlpd -1(%ecx), %xmm0 + movlpd %xmm0, -1(%edx) + mov $6, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit11Case2OrCase3): + movlpd -2(%ecx), %xmm0 + movlpd %xmm0, -2(%edx) + mov $5, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit12Case2OrCase3): + movl (%ecx), %esi + movl %esi, (%edx) + mov $4, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit13Case2OrCase3): + movl -1(%ecx), %esi + movl %esi, -1(%edx) + mov $3, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit14Case2OrCase3): + movl -2(%ecx), %esi + movl %esi, -2(%edx) + mov $2, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit15Case2OrCase3): + movl -3(%ecx), %esi + movl %esi, -3(%edx) + mov $1, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave1): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit1) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit1) + palignr $1, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit1) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit1) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit1): + lea 15(%edx, %esi), %edx + lea 15(%ecx, %esi), %ecx + movdqu -16(%ecx), %xmm0 + xor %esi, %esi + movdqu %xmm0, -16(%edx) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave2): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit2) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit2) + palignr $2, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit2) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit2) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit2): + lea 14(%edx, %esi), %edx + lea 14(%ecx, %esi), %ecx + movdqu -16(%ecx), %xmm0 + xor %esi, %esi + movdqu %xmm0, -16(%edx) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave3): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit3) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit3) + palignr $3, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit3) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit3) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit3): + lea 13(%edx, %esi), %edx + lea 13(%ecx, 
%esi), %ecx + movdqu -16(%ecx), %xmm0 + xor %esi, %esi + movdqu %xmm0, -16(%edx) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave4): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit4) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit4) + palignr $4, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit4) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit4) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit4): + lea 12(%edx, %esi), %edx + lea 12(%ecx, %esi), %ecx + movlpd -12(%ecx), %xmm0 + movl -4(%ecx), %eax + movlpd %xmm0, -12(%edx) + movl %eax, -4(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave5): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit5) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit5) + palignr $5, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit5) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit5) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit5): + lea 11(%edx, %esi), %edx + lea 11(%ecx, %esi), %ecx + movlpd -11(%ecx), %xmm0 + movl -4(%ecx), %eax + movlpd %xmm0, -11(%edx) + movl %eax, -4(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave6): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit6) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit6) + palignr $6, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit6) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit6) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit6): + lea 10(%edx, %esi), %edx + lea 10(%ecx, %esi), %ecx + + movlpd -10(%ecx), %xmm0 + movw -2(%ecx), %ax + movlpd %xmm0, -10(%edx) + movw %ax, -2(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave7): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit7) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit7) + palignr $7, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit7) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit7) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit7): + lea 9(%edx, %esi), %edx + lea 9(%ecx, %esi), %ecx + + movlpd -9(%ecx), %xmm0 + movb -1(%ecx), %ah + movlpd %xmm0, -9(%edx) + movb %ah, -1(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave8): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit8) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit8) + palignr $8, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit8) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit8) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit8): + lea 8(%edx, %esi), %edx + lea 8(%ecx, %esi), %ecx + movlpd -8(%ecx), %xmm0 + movlpd %xmm0, -8(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + 
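+/* The L(StrncpyLeaveN) blocks above and below all lean on palignr to
+   realign the source: "palignr $N, %xmm1, %xmm2" leaves in %xmm2
+   bytes N..N+15 of the 32-byte concatenation of %xmm1 (low half) and
+   %xmm2 (high half).  A minimal C model of that one instruction,
+   illustrative only (n must be <= 16 here):
+
+     #include <stdint.h>
+     #include <string.h>
+
+     // dst and src model %xmm2 and %xmm1; n models the immediate.
+     static void
+     palignr_model (uint8_t dst[16], const uint8_t src[16], unsigned n)
+     {
+       uint8_t buf[32];
+       memcpy (buf, src, 16);       // low 16 bytes: source operand
+       memcpy (buf + 16, dst, 16);  // high 16 bytes: destination
+       memcpy (dst, buf + n, 16);   // keep bytes n .. n+15
+     }
+
+   This is how 16-byte-aligned stores are produced from a source that
+   sits N bytes off alignment.  */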
+L(StrncpyLeave9): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit9) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit9) + palignr $9, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit9) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit9) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit9): + lea 7(%edx, %esi), %edx + lea 7(%ecx, %esi), %ecx + + movlpd -8(%ecx), %xmm0 + movlpd %xmm0, -8(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave10): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit10) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit10) + palignr $10, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit10) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit10) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit10): + lea 6(%edx, %esi), %edx + lea 6(%ecx, %esi), %ecx + + movlpd -8(%ecx), %xmm0 + movlpd %xmm0, -8(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave11): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit11) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit11) + palignr $11, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit11) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit11) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit11): + lea 5(%edx, %esi), %edx + lea 5(%ecx, %esi), %ecx + movl -5(%ecx), %esi + movb -1(%ecx), %ah + movl %esi, -5(%edx) + movb %ah, -1(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave12): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit12) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit12) + palignr $12, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit12) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit12) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit12): + lea 4(%edx, %esi), %edx + lea 4(%ecx, %esi), %ecx + movl -4(%ecx), %eax + movl %eax, -4(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave13): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit13) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit13) + palignr $13, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit13) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit13) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit13): + lea 3(%edx, %esi), %edx + lea 3(%ecx, %esi), %ecx + + movl -4(%ecx), %eax + movl %eax, -4(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave14): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit14) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit14) + palignr $14, %xmm3, %xmm2 + 
movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit14) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit14) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit14): + lea 2(%edx, %esi), %edx + lea 2(%ecx, %esi), %ecx + movw -2(%ecx), %ax + movw %ax, -2(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave15): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit15) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit15) + palignr $15, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit15) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit15) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit15): + lea 1(%edx, %esi), %edx + lea 1(%ecx, %esi), %ecx + movb -1(%ecx), %ah + movb %ah, -1(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) +# endif + +# ifndef USE_AS_STRCAT +# ifdef USE_AS_STRNCPY + CFI_POP (%esi) + CFI_POP (%edi) + + .p2align 4 +L(ExitTail0): + movl %edx, %eax + RETURN + + .p2align 4 +L(StrncpyExit15Bytes): + cmp $12, %ebx + jbe L(StrncpyExit12Bytes) + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + cmpb $0, 11(%ecx) + jz L(ExitTail12) + cmp $13, %ebx + je L(ExitTail13) + cmpb $0, 12(%ecx) + jz L(ExitTail13) + cmp $14, %ebx + je L(ExitTail14) + cmpb $0, 13(%ecx) + jz L(ExitTail14) + movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 7(%edx) +# ifdef USE_AS_STPCPY + lea 14(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax +# else + movl %edx, %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit12Bytes): + cmp $9, %ebx + je L(ExitTail9) + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmp $10, %ebx + je L(ExitTail10) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmp $11, %ebx + je L(ExitTail11) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT_TAIL (11) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit8Bytes): + cmp $4, %ebx + jbe L(StrncpyExit4Bytes) + cmpb $0, (%ecx) + jz L(ExitTail1) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + cmpb $0, 3(%ecx) + jz L(ExitTail4) + + cmp $5, %ebx + je L(ExitTail5) + cmpb $0, 4(%ecx) + jz L(ExitTail5) + cmp $6, %ebx + je L(ExitTail6) + cmpb $0, 5(%ecx) + jz L(ExitTail6) + cmp $7, %ebx + je L(ExitTail7) + cmpb $0, 6(%ecx) + jz L(ExitTail7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) +# ifdef USE_AS_STPCPY + lea 7(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax +# else + movl %edx, %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit4Bytes): + test %ebx, %ebx + jz L(ExitTail0) + cmp $1, %ebx + je L(ExitTail1) + cmpb $0, (%ecx) + jz L(ExitTail1) + cmp $2, %ebx + je L(ExitTail2) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmp $3, %ebx + je L(ExitTail3) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT_TAIL (3) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif + RETURN +# endif + +END (STRCPY) +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S new file mode 100644 index 0000000000..ffbc03c6d5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S @@ -0,0 +1,116 
@@ +/* Multiple versions of strcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY) +# ifndef STRCPY +# define STRCPY strcpy +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __stpncpy_ssse3 +# define STRCPY_SSE2 __stpncpy_sse2 +# define STRCPY_IA32 __stpncpy_ia32 +# define __GI_STRCPY __GI_stpncpy +# define __GI___STRCPY __GI___stpncpy +# else +# define STRCPY_SSSE3 __stpcpy_ssse3 +# define STRCPY_SSE2 __stpcpy_sse2 +# define STRCPY_IA32 __stpcpy_ia32 +# define __GI_STRCPY __GI_stpcpy +# define __GI___STRCPY __GI___stpcpy +# endif +#else +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __strncpy_ssse3 +# define STRCPY_SSE2 __strncpy_sse2 +# define STRCPY_IA32 __strncpy_ia32 +# define __GI_STRCPY __GI_strncpy +# else +# define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_SSE2 __strcpy_sse2 +# define STRCPY_IA32 __strcpy_ia32 +# define __GI_STRCPY __GI_strcpy +# endif +#endif + + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncpy in static library since we + need strncpy before the initialization happened. */ +#if IS_IN (libc) + + .text +ENTRY(STRCPY) + .type STRCPY, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (STRCPY_IA32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (STRCPY_SSE2) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (STRCPY_SSSE3) +2: ret +END(STRCPY) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCPY_IA32, @function; \ + .align 16; \ + .globl STRCPY_IA32; \ + .hidden STRCPY_IA32; \ + STRCPY_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCPY_IA32, .-STRCPY_IA32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcpy calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. 
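+
+   For reference, the selection encoded by the ENTRY(STRCPY) resolver
+   above can be written as the following hedged C sketch; the flag
+   variables are hypothetical stand-ins for HAS_CPU_FEATURE and
+   HAS_ARCH_FEATURE, and the functions stand in for the STRCPY_*
+   entry points defined earlier in this file:
+
+     typedef char *(*strcpy_fn) (char *, const char *);
+     extern int cpu_has_sse2, cpu_has_ssse3;   // hypothetical flags
+     extern int arch_fast_unaligned_load;      // hypothetical flag
+     extern char *strcpy_ia32 (char *, const char *);
+     extern char *strcpy_sse2 (char *, const char *);
+     extern char *strcpy_ssse3 (char *, const char *);
+
+     static strcpy_fn
+     strcpy_resolver (void)
+     {
+       if (!cpu_has_sse2)
+         return strcpy_ia32;
+       if (arch_fast_unaligned_load)   // Fast_Unaligned_Load
+         return strcpy_sse2;
+       if (cpu_has_ssse3)
+         return strcpy_ssse3;
+       return strcpy_sse2;
+     }
+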
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCPY; __GI_STRCPY = STRCPY_IA32 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCPY; __GI___STRCPY = STRCPY_IA32 + +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# include "../../stpncpy.S" +# else +# include "../../i586/stpcpy.S" +# endif +#else +# ifndef USE_AS_STRNCPY +# include "../../i586/strcpy.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c new file mode 100644 index 0000000000..6d61e190a8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c @@ -0,0 +1,2 @@ +#define __strcspn_sse2 __strcspn_ia32 +#include <sysdeps/x86_64/multiarch/strcspn-c.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S new file mode 100644 index 0000000000..21e5093924 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S @@ -0,0 +1,75 @@ +/* Multiple versions of strcspn + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <init-arch.h> + +#ifdef USE_AS_STRPBRK +#define STRCSPN_SSE42 __strpbrk_sse42 +#define STRCSPN_IA32 __strpbrk_ia32 +#define __GI_STRCSPN __GI_strpbrk +#else +#ifndef STRCSPN +#define STRCSPN strcspn +#define STRCSPN_SSE42 __strcspn_sse42 +#define STRCSPN_IA32 __strcspn_ia32 +#define __GI_STRCSPN __GI_strcspn +#endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strpbrk in static library since we + need strpbrk before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc) + .text +ENTRY(STRCSPN) + .type STRCSPN, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (STRCSPN_IA32) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + LOAD_FUNC_GOT_EAX (STRCSPN_SSE42) +2: ret +END(STRCSPN) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCSPN_IA32, @function; \ + .globl STRCSPN_IA32; \ + .p2align 4; \ + STRCSPN_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCSPN_IA32, .-STRCSPN_IA32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. 
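+   (Concretely: an i386 PIC PLT stub dispatches through the GOT,
+   roughly "jmp *sym@GOT(%ebx)", so it only works after the caller has
+   loaded the GOT address into %ebx.  Calls to hidden symbols are
+   direct and never set %ebx up, which is why the alias below binds
+   internal callers straight to the ia32 implementation.)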
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_IA32 +#endif + +#ifdef USE_AS_STRPBRK +#include "../../strpbrk.S" +#else +#include "../../strcspn.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S new file mode 100644 index 0000000000..d3ea864bab --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S @@ -0,0 +1,125 @@ +/* strlen with SSE2 and BSF + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if defined SHARED && IS_IN (libc) + +#include <sysdep.h> + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) +#define PARMS 4 + 8 /* Preserve ESI and EDI. */ +#define STR PARMS +#define ENTRANCE PUSH (%esi); PUSH (%edi); cfi_remember_state +#define RETURN POP (%edi); POP (%esi); ret; \ + cfi_restore_state; cfi_remember_state + + .text +ENTRY ( __strlen_sse2_bsf) + ENTRANCE + mov STR(%esp), %edi + xor %eax, %eax + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%edi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit_less16) + mov %edi, %eax + and $-16, %eax + jmp L(align16_start) +L(next): + + mov %edi, %eax + and $-16, %eax + pcmpeqb (%eax), %xmm0 + mov $-1, %esi + sub %eax, %ecx + shl %cl, %esi + pmovmskb %xmm0, %edx + and %esi, %edx + jnz L(exit) +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + .p2align 4 +L(align16_loop): + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%eax), %eax + test %edx, %edx + jz L(align16_loop) +L(exit): + sub %edi, %eax +L(exit_less16): + bsf %edx, %edx + add %edx, %eax + RETURN +L(exit16): + sub %edi, %eax + bsf %edx, %edx + add %edx, %eax + add $16, %eax + RETURN +L(exit32): + sub %edi, %eax + bsf %edx, %edx + add %edx, %eax + add $32, %eax + RETURN +L(exit48): + sub %edi, %eax + bsf %edx, %edx + add %edx, %eax + add $48, %eax + POP (%edi) + POP (%esi) + ret + +END ( __strlen_sse2_bsf) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S new file mode 100644 index 0000000000..36fc1469d0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S @@ -0,0 +1,695 @@ +/* strlen with SSE2 + 
Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* for strlen only SHARED version is optimized, for strcat, strncat, strnlen both STATIC and SHARED are optimized */ + +#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc) + +# ifndef USE_AS_STRCAT + +# include <sysdep.h> +# define PARMS 4 +# define STR PARMS +# define RETURN ret + +# ifdef USE_AS_STRNLEN +# define LEN PARMS + 8 +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) +# undef RETURN +# define RETURN POP (%edi); CFI_PUSH(%edi); ret +# endif + +# ifndef STRLEN +# define STRLEN __strlen_sse2 +# endif + + atom_text_section +ENTRY (STRLEN) + mov STR(%esp), %edx +# ifdef USE_AS_STRNLEN + PUSH (%edi) + movl LEN(%esp), %edi + sub $4, %edi + jbe L(len_less4_prolog) +# endif +# endif + xor %eax, %eax + cmpb $0, (%edx) + jz L(exit_tail0) + cmpb $0, 1(%edx) + jz L(exit_tail1) + cmpb $0, 2(%edx) + jz L(exit_tail2) + cmpb $0, 3(%edx) + jz L(exit_tail3) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less8_prolog) +# endif + + cmpb $0, 4(%edx) + jz L(exit_tail4) + cmpb $0, 5(%edx) + jz L(exit_tail5) + cmpb $0, 6(%edx) + jz L(exit_tail6) + cmpb $0, 7(%edx) + jz L(exit_tail7) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less12_prolog) +# endif + + cmpb $0, 8(%edx) + jz L(exit_tail8) + cmpb $0, 9(%edx) + jz L(exit_tail9) + cmpb $0, 10(%edx) + jz L(exit_tail10) + cmpb $0, 11(%edx) + jz L(exit_tail11) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less16_prolog) +# endif + + cmpb $0, 12(%edx) + jz L(exit_tail12) + cmpb $0, 13(%edx) + jz L(exit_tail13) + cmpb $0, 14(%edx) + jz L(exit_tail14) + cmpb $0, 15(%edx) + jz L(exit_tail15) + + pxor %xmm0, %xmm0 + lea 16(%edx), %eax + mov %eax, %ecx + and $-16, %eax + +# ifdef USE_AS_STRNLEN + and $15, %edx + add %edx, %edi + sub $64, %edi + jbe L(len_less64) +# endif + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb 
(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + mov %eax, %edx + and $63, %edx + add %edx, %edi +# endif + + and $-0x40, %eax + + .p2align 4 +L(aligned_64_loop): +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + movaps (%eax), %xmm0 + movaps 16(%eax), %xmm1 + movaps 32(%eax), %xmm2 + movaps 48(%eax), %xmm6 + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqb %xmm3, %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 64(%eax), %eax + jz L(aligned_64_loop) + + pcmpeqb -64(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 48(%ecx), %ecx + jnz L(exit) + + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqb -32(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqb %xmm6, %xmm3 + pmovmskb %xmm3, %edx + lea -16(%ecx), %ecx +L(exit): + sub %ecx, %eax + test %dl, %dl + jz L(exit_high) + + mov %dl, %cl + and $15, %cl + jz L(exit_8) + test $0x01, %dl + jnz L(exit_tail0) + test $0x02, %dl + jnz L(exit_tail1) + test $0x04, %dl + jnz L(exit_tail2) + add $3, %eax + RETURN + + .p2align 4 +L(exit_8): + test $0x10, %dl + jnz L(exit_tail4) + test $0x20, %dl + jnz L(exit_tail5) + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax + RETURN + + .p2align 4 +L(exit_high): + mov %dh, %ch + and $15, %ch + jz L(exit_high_8) + test $0x01, %dh + jnz L(exit_tail8) + test $0x02, %dh + jnz L(exit_tail9) + test $0x04, %dh + jnz L(exit_tail10) + add $11, %eax + RETURN + + .p2align 4 +L(exit_high_8): + test $0x10, %dh + jnz L(exit_tail12) + test $0x20, %dh + jnz L(exit_tail13) + test $0x40, %dh + jnz L(exit_tail14) + add $15, %eax +L(exit_tail0): + RETURN + +# ifdef USE_AS_STRNLEN + + .p2align 4 +L(len_less64): + pxor %xmm0, %xmm0 + add $64, %edi + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + movl LEN(%esp), %eax + 
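+/* Every probe in this file is the same three-instruction pattern:
+   pcmpeqb turns each zero byte of a 16-byte block into 0xff, and
+   pmovmskb collapses the byte lanes into a 16-bit mask, one bit per
+   byte.  A self-contained C model of one probe (a sketch, not the
+   actual implementation):
+
+     static int
+     null_mask16 (const unsigned char *p)   // p: one 16-byte block
+     {
+       int m = 0;
+       for (int i = 0; i < 16; i++)
+         if (p[i] == 0)
+           m |= 1 << i;
+       return m;   // lowest set bit = index of first NUL in the block
+     }
+
+   A zero mask means no NUL in this block; a nonzero mask is decoded
+   by the exit ladders below.  */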
RETURN + + .p2align 4 +L(strnlen_exit): + sub %ecx, %eax + + test %dl, %dl + jz L(strnlen_exit_high) + mov %dl, %cl + and $15, %cl + jz L(strnlen_exit_8) + test $0x01, %dl + jnz L(exit_tail0) + test $0x02, %dl + jnz L(strnlen_exit_tail1) + test $0x04, %dl + jnz L(strnlen_exit_tail2) + sub $4, %edi + jb L(return_start_len) + lea 3(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_8): + test $0x10, %dl + jnz L(strnlen_exit_tail4) + test $0x20, %dl + jnz L(strnlen_exit_tail5) + test $0x40, %dl + jnz L(strnlen_exit_tail6) + sub $8, %edi + jb L(return_start_len) + lea 7(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_high): + mov %dh, %ch + and $15, %ch + jz L(strnlen_exit_high_8) + test $0x01, %dh + jnz L(strnlen_exit_tail8) + test $0x02, %dh + jnz L(strnlen_exit_tail9) + test $0x04, %dh + jnz L(strnlen_exit_tail10) + sub $12, %edi + jb L(return_start_len) + lea 11(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_high_8): + test $0x10, %dh + jnz L(strnlen_exit_tail12) + test $0x20, %dh + jnz L(strnlen_exit_tail13) + test $0x40, %dh + jnz L(strnlen_exit_tail14) + sub $16, %edi + jb L(return_start_len) + lea 15(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail1): + sub $2, %edi + jb L(return_start_len) + lea 1(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail2): + sub $3, %edi + jb L(return_start_len) + lea 2(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail4): + sub $5, %edi + jb L(return_start_len) + lea 4(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail5): + sub $6, %edi + jb L(return_start_len) + lea 5(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail6): + sub $7, %edi + jb L(return_start_len) + lea 6(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail8): + sub $9, %edi + jb L(return_start_len) + lea 8(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail9): + sub $10, %edi + jb L(return_start_len) + lea 9(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail10): + sub $11, %edi + jb L(return_start_len) + lea 10(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail12): + sub $13, %edi + jb L(return_start_len) + lea 12(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail13): + sub $14, %edi + jb L(return_start_len) + lea 13(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail14): + sub $15, %edi + jb L(return_start_len) + lea 14(%eax), %eax + RETURN + + .p2align 4 +L(return_start_len): + movl LEN(%esp), %eax + RETURN + +/* for prolog only */ + + .p2align 4 +L(len_less4_prolog): + xor %eax, %eax + + add $4, %edi + jz L(exit_tail0) + + cmpb $0, (%edx) + jz L(exit_tail0) + cmp $1, %edi + je L(exit_tail1) + + cmpb $0, 1(%edx) + jz L(exit_tail1) + cmp $2, %edi + je L(exit_tail2) + + cmpb $0, 2(%edx) + jz L(exit_tail2) + cmp $3, %edi + je L(exit_tail3) + + cmpb $0, 3(%edx) + jz L(exit_tail3) + mov $4, %eax + RETURN + + .p2align 4 +L(len_less8_prolog): + add $4, %edi + + cmpb $0, 4(%edx) + jz L(exit_tail4) + cmp $1, %edi + je L(exit_tail5) + + cmpb $0, 5(%edx) + jz L(exit_tail5) + cmp $2, %edi + je L(exit_tail6) + + cmpb $0, 6(%edx) + jz L(exit_tail6) + cmp $3, %edi + je L(exit_tail7) + + cmpb $0, 7(%edx) + jz L(exit_tail7) + mov $8, %eax + RETURN + + + .p2align 4 +L(len_less12_prolog): + add $4, %edi + + cmpb $0, 8(%edx) + jz L(exit_tail8) + cmp $1, %edi + je L(exit_tail9) + + cmpb $0, 9(%edx) + jz L(exit_tail9) + cmp $2, %edi + je L(exit_tail10) + + cmpb $0, 10(%edx) + jz L(exit_tail10) + cmp $3, %edi + je L(exit_tail11) + + cmpb $0, 11(%edx) + jz L(exit_tail11) + mov $12, %eax + RETURN + + .p2align 4 +L(len_less16_prolog): + add 
$4, %edi + + cmpb $0, 12(%edx) + jz L(exit_tail12) + cmp $1, %edi + je L(exit_tail13) + + cmpb $0, 13(%edx) + jz L(exit_tail13) + cmp $2, %edi + je L(exit_tail14) + + cmpb $0, 14(%edx) + jz L(exit_tail14) + cmp $3, %edi + je L(exit_tail15) + + cmpb $0, 15(%edx) + jz L(exit_tail15) + mov $16, %eax + RETURN +# endif + + .p2align 4 +L(exit_tail1): + add $1, %eax + RETURN + +L(exit_tail2): + add $2, %eax + RETURN + +L(exit_tail3): + add $3, %eax + RETURN + +L(exit_tail4): + add $4, %eax + RETURN + +L(exit_tail5): + add $5, %eax + RETURN + +L(exit_tail6): + add $6, %eax + RETURN + +L(exit_tail7): + add $7, %eax + RETURN + +L(exit_tail8): + add $8, %eax + RETURN + +L(exit_tail9): + add $9, %eax + RETURN + +L(exit_tail10): + add $10, %eax + RETURN + +L(exit_tail11): + add $11, %eax + RETURN + +L(exit_tail12): + add $12, %eax + RETURN + +L(exit_tail13): + add $13, %eax + RETURN + +L(exit_tail14): + add $14, %eax + RETURN + +L(exit_tail15): + add $15, %eax +# ifndef USE_AS_STRCAT + RETURN +END (STRLEN) +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S new file mode 100644 index 0000000000..77cf6bcdb0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S @@ -0,0 +1,60 @@ +/* Multiple versions of strlen + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc and for the + DSO. In static binaries, we need strlen before the initialization + happened. */ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(strlen) + .type strlen, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strlen_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__strlen_sse2_bsf) + HAS_ARCH_FEATURE (Slow_BSF) + jz 2f + LOAD_FUNC_GOT_EAX (__strlen_sse2) +2: ret +END(strlen) + +# undef ENTRY +# define ENTRY(name) \ + .type __strlen_ia32, @function; \ + .globl __strlen_ia32; \ + .p2align 4; \ + __strlen_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strlen_ia32, .-__strlen_ia32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. 
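+
+   As for the dispatch above: __strlen_sse2_bsf uses the bsf
+   instruction to index the first NUL bit, while __strlen_sse2 decodes
+   the mask with the unrolled test ladders seen in strlen-sse2.S, in
+   effect inlining this loop (a hedged C model, not the actual
+   source):
+
+     static int
+     first_set_bit (unsigned mask)   // precondition: mask != 0
+     {
+       int i = 0;
+       while (!(mask & 1))
+         {
+           mask >>= 1;
+           i++;
+         }
+       return i;
+     }
+
+   so the Slow_BSF arch feature steers processors with a slow bsf,
+   such as early Atom cores, to the ladder version.
+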
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strlen; __GI_strlen = __strlen_ia32 +#endif + +#include "../../i586/strlen.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c new file mode 100644 index 0000000000..76581eb62b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c @@ -0,0 +1,8 @@ +#include <string.h> + +extern __typeof (strncasecmp) __strncasecmp_nonascii; + +#define __strncasecmp __strncasecmp_nonascii +#include <string/strncase.c> + +strong_alias (__strncasecmp_nonascii, __strncasecmp_ia32) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S new file mode 100644 index 0000000000..a56e63a566 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S @@ -0,0 +1,39 @@ +/* Entry point for multi-version x86 strncasecmp. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY(__strncasecmp) + .type __strncasecmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strncasecmp_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__strncasecmp_ssse3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + LOAD_FUNC_GOT_EAX (__strncasecmp_sse4_2) +2: ret +END(__strncasecmp) + +weak_alias (__strncasecmp, strncasecmp) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c new file mode 100644 index 0000000000..7e601af271 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c @@ -0,0 +1,13 @@ +#include <string.h> + +extern __typeof (strncasecmp_l) __strncasecmp_l_nonascii; + +#define __strncasecmp_l __strncasecmp_l_nonascii +#define USE_IN_EXTENDED_LOCALE_MODEL 1 +#include <string/strncase.c> + +strong_alias (__strncasecmp_l_nonascii, __strncasecmp_l_ia32) + +/* The needs of strcasecmp in libc are minimal, no need to go through + the IFUNC. 
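+
+   (Note the double gate in the __strncasecmp resolver above: the
+   SSE4.2 version is chosen only when the CPU both reports SSE4_2 and
+   is not flagged Slow_SSE4_2; in C terms, roughly
+
+     use_sse42 = cpu_has_sse42 && !arch_slow_sse42;  // hypothetical flags
+
+   otherwise the SSSE3 version already loaded into %eax is returned.)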
*/ +strong_alias (__strncasecmp_l_nonascii, __GI___strncasecmp_l) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S new file mode 100644 index 0000000000..557210832e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S @@ -0,0 +1,2 @@ +#define USE_AS_STRNCASECMP_L 1 +#include "strcmp-sse4.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S new file mode 100644 index 0000000000..d438a1ae35 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S @@ -0,0 +1,2 @@ +#define USE_AS_STRNCASECMP_L 1 +#include "strcmp-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S new file mode 100644 index 0000000000..8a74ee8574 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S @@ -0,0 +1,7 @@ +/* Multiple versions of strncasecmp_l + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP __strncasecmp_l +#define USE_AS_STRNCASECMP_L +#include "strcmp.S" + +weak_alias (__strncasecmp_l, strncasecmp_l) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c new file mode 100644 index 0000000000..132a000545 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c @@ -0,0 +1,8 @@ +#define STRNCAT __strncat_ia32 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32); +#endif + +#include "string/strncat.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S new file mode 100644 index 0000000000..f1045b72b8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S @@ -0,0 +1,4 @@ +#define STRCAT __strncat_sse2 +#define USE_AS_STRNCAT + +#include "strcat-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S new file mode 100644 index 0000000000..625b90a978 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S @@ -0,0 +1,4 @@ +#define STRCAT __strncat_ssse3 +#define USE_AS_STRNCAT + +#include "strcat-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S new file mode 100644 index 0000000000..5c1bf41453 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncat + All versions must be listed in ifunc-impl-list.c. 
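+
+   The two-line wrapper files above follow one template trick: define
+   the exported entry-point name and a USE_AS_* switch, then include
+   the base implementation, which compiles the shared body under the
+   new behavior.  A further variant would look like this (hypothetical
+   file and names, purely for illustration):
+
+     #define STRCAT __strncat_sse4   // hypothetical entry point
+     #define USE_AS_STRNCAT
+     #include "strcat-sse4.S"        // hypothetical base file
+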
*/ +#define STRCAT strncat +#define USE_AS_STRNCAT +#include "strcat.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c new file mode 100644 index 0000000000..cc059da494 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c @@ -0,0 +1,8 @@ +#ifdef SHARED +# define STRNCMP __strncmp_ia32 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncmp_ia32, __GI_strncmp, __strncmp_ia32); +#endif + +#include "string/strncmp.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S new file mode 100644 index 0000000000..cf14dfaf6c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S @@ -0,0 +1,5 @@ +#ifdef SHARED +# define USE_AS_STRNCMP +# define STRCMP __strncmp_sse4_2 +# include "strcmp-sse4.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S new file mode 100644 index 0000000000..536c8685f2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S @@ -0,0 +1,5 @@ +#ifdef SHARED +# define USE_AS_STRNCMP +# define STRCMP __strncmp_ssse3 +# include "strcmp-ssse3.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S new file mode 100644 index 0000000000..150d4786d2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncmp + All versions must be listed in ifunc-impl-list.c. */ +#define USE_AS_STRNCMP +#define STRCMP strncmp +#include "strcmp.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c new file mode 100644 index 0000000000..201e3f98b3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c @@ -0,0 +1,8 @@ +#define STRNCPY __strncpy_ia32 +#ifdef SHARED +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncpy_ia32, __GI_strncpy, __strncpy_ia32); +#endif + +#include "string/strncpy.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S new file mode 100644 index 0000000000..bdd99239a4 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_sse2 +#include "strcpy-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S new file mode 100644 index 0000000000..bf82ee447d --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S new file mode 100644 index 0000000000..9c257efc6e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncpy + All versions must be listed in ifunc-impl-list.c. 
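+
+   (The strncmp-sse4.S and strncmp-ssse3.S wrappers above are guarded
+   by "#ifdef SHARED" for the same reason given for strncpy in
+   strcpy.S: a static libc needs the plain implementation before the
+   IFUNC machinery is initialized, so the alternatives are built only
+   for the shared object.)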
*/ +#define USE_AS_STRNCPY +#define STRCPY strncpy +#include "strcpy.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c new file mode 100644 index 0000000000..351e939a93 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c @@ -0,0 +1,10 @@ +#define STRNLEN __strnlen_ia32 +#ifdef SHARED +# undef libc_hidden_def +# define libc_hidden_def(name) \ + __hidden_ver1 (__strnlen_ia32, __GI_strnlen, __strnlen_ia32); \ + strong_alias (__strnlen_ia32, __strnlen_ia32_1); \ + __hidden_ver1 (__strnlen_ia32_1, __GI___strnlen, __strnlen_ia32_1); +#endif + +#include "string/strnlen.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S new file mode 100644 index 0000000000..56b6ae2a5c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNLEN +#define STRLEN __strnlen_sse2 +#include "strlen-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S new file mode 100644 index 0000000000..d241522c70 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S @@ -0,0 +1,37 @@ +/* Multiple versions of strnlen + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__strnlen) + .type __strnlen, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strnlen_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__strnlen_sse2) +2: ret +END(__strnlen) + +weak_alias(__strnlen, strnlen) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c new file mode 100644 index 0000000000..5db62053b3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c @@ -0,0 +1,2 @@ +#define __strpbrk_sse2 __strpbrk_ia32 +#include <sysdeps/x86_64/multiarch/strpbrk-c.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S new file mode 100644 index 0000000000..7201d6376f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S @@ -0,0 +1,5 @@ +/* Multiple versions of strpbrk + All versions must be listed in ifunc-impl-list.c. 
*/ +#define STRCSPN strpbrk +#define USE_AS_STRPBRK +#include "strcspn.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S new file mode 100644 index 0000000000..39a7c8825b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S @@ -0,0 +1,282 @@ +/* strrchr with SSE2 with bsf and bsr + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + + .text +ENTRY (__strrchr_sse2_bsf) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + PUSH (%edi) + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + /* ECX has OFFSET. */ + and $63, %ecx + cmp $48, %ecx + pshufd $0, %xmm1, %xmm1 + ja L(crosscashe) + +/* unaligned string. */ + movdqu (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm2, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + + test %eax, %eax + jnz L(unaligned_match1) + + test %edx, %edx + jnz L(return_null) + + and $-16, %edi + add $16, %edi + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_return_value1): + bsf %edx, %ecx + mov $2, %edx + shl %cl, %edx + sub $1, %edx + and %edx, %eax + jz L(return_null) + bsr %eax, %eax + add %edi, %eax + POP (%edi) + ret + CFI_PUSH (%edi) + + .p2align 4 +L(unaligned_match1): + test %edx, %edx + jnz L(unaligned_return_value1) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + lea 16(%edi), %esi + and $-16, %edi + add $16, %edi + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 + L(crosscashe): +/* Hancle unaligned string. */ + and $15, %ecx + and $-16, %edi + pxor %xmm3, %xmm3 + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm3, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. 
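+   The aligned load above may have read up to 15 bytes that precede
+   the real start of the string, so both the NUL mask and the match
+   mask can contain stale low bits.  %cl holds the misalignment, and
+   the two shr instructions below discard exactly those bits; as a
+   one-line C model (sketch):
+
+     mask >>= misalign;   // drop bits for bytes before the string
+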
*/ + shr %cl, %edx + shr %cl, %eax + + test %eax, %eax + jnz L(unaligned_match) + + test %edx, %edx + jnz L(return_null) + + add $16, %edi + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_return_value): + add %ecx, %edi + bsf %edx, %ecx + mov $2, %edx + shl %cl, %edx + sub $1, %edx + and %edx, %eax + jz L(return_null) + bsr %eax, %eax + add %edi, %eax + POP (%edi) + ret + CFI_PUSH (%edi) + + .p2align 4 +L(unaligned_match): + test %edx, %edx + jnz L(unaligned_return_value) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + add $16, %edi + lea (%edi, %ecx), %esi + +/* Loop start on aligned string. */ + .p2align 4 +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jz L(loop) + +L(matches): + test %eax, %eax + jnz L(match) +L(return_value): + test %ebx, %ebx + jz L(return_null_1) + bsr %ebx, %eax + add %esi, %eax + + POP (%ebx) + POP (%esi) + + sub $16, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(match): + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(return_value_1) + mov %eax, %ebx + mov %edi, %esi + jmp L(loop) + + .p2align 4 +L(return_value_1): + bsf %ecx, %ecx + mov $2, %edx + shl %cl, %edx + sub $1, %edx + and %edx, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + + bsr %eax, %eax + add %edi, %eax + sub $16, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) +/* Return NULL. */ + .p2align 4 +L(return_null): + xor %eax, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + CFI_PUSH (%ebx) + CFI_PUSH (%esi) +/* Return NULL. */ + .p2align 4 +L(return_null_1): + POP (%ebx) + POP (%esi) + POP (%edi) + xor %eax, %eax + ret + +END (__strrchr_sse2_bsf) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S new file mode 100644 index 0000000000..20934288be --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S @@ -0,0 +1,708 @@ +/* strrchr SSE2 without bsf and bsr + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 +# define ENTRANCE PUSH(%edi); +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); + +# define STR1 PARMS +# define STR2 STR1+4 + + atom_text_section +ENTRY (__strrchr_sse2) + + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + /* ECX has OFFSET. */ + and $63, %ecx + cmp $48, %ecx + pshufd $0, %xmm1, %xmm1 + ja L(crosscache) + +/* unaligned string. */ + movdqu (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm2, %ecx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + add $16, %edi + + test %eax, %eax + jnz L(unaligned_match1) + + test %ecx, %ecx + jnz L(return_null) + + and $-16, %edi + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_match1): + test %ecx, %ecx + jnz L(prolog_find_zero_1) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + mov %edi, %esi + and $-16, %edi + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(crosscache): +/* Hancle unaligned string. */ + and $15, %ecx + and $-16, %edi + pxor %xmm3, %xmm3 + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm3, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + shr %cl, %edx + shr %cl, %eax + add $16, %edi + + test %eax, %eax + jnz L(unaligned_match) + + test %edx, %edx + jnz L(return_null) + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_match): + test %edx, %edx + jnz L(prolog_find_zero) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + lea (%edi, %ecx), %esi + +/* Loop start on aligned string. 
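+   The 4x-unrolled loop below scans 16 bytes per step, remembering the
+   most recent block that held a match (mask in %ebx, position in
+   %esi) until a NUL turns up.  A self-contained C sketch of the
+   strategy, illustrative only:
+
+     // bit i of the result is set iff p[i] == c (pcmpeqb + pmovmskb)
+     static unsigned
+     eq_mask16 (const unsigned char *p, unsigned char c)
+     {
+       unsigned m = 0;
+       for (int i = 0; i < 16; i++)
+         if (p[i] == c)
+           m |= 1u << i;
+       return m;
+     }
+
+     static int
+     top_bit (unsigned m)   // highest set bit, models bsr
+     {
+       int i = -1;
+       while (m)
+         {
+           m >>= 1;
+           i++;
+         }
+       return i;
+     }
+
+     static const unsigned char *
+     rscan_blocks (const unsigned char *blk, unsigned char c)
+     {
+       unsigned save_m = 0;
+       const unsigned char *save_b = blk;
+       for (;; blk += 16)
+         {
+           unsigned m_c = eq_mask16 (blk, c);
+           unsigned m_0 = eq_mask16 (blk, 0);
+           if (m_0 == 0)
+             {
+               if (m_c)
+                 {
+                   save_m = m_c;   // latest block with a hit
+                   save_b = blk;
+                 }
+               continue;
+             }
+           unsigned low0 = m_0 & (0u - m_0);  // bit of the first NUL
+           m_c &= (low0 << 1) - 1;     // keep hits up to that NUL
+           if (m_c)
+             return blk + top_bit (m_c);
+           return save_m ? save_b + top_bit (save_m) : 0;
+         }
+     }
+
+   Note the mask keeps the bit at the NUL itself, which is what makes
+   searching for the terminator, strrchr(s, 0), work.
+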
*/ + .p2align 4 +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jz L(loop) + +L(matches): + test %eax, %eax + jnz L(match) +L(return_value): + test %ebx, %ebx + jz L(return_null_1) + mov %ebx, %eax + mov %esi, %edi + + POP (%ebx) + POP (%esi) + + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(return_null_1): + POP (%ebx) + POP (%esi) + + xor %eax, %eax + RETURN + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(match): + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(find_zero) + mov %eax, %ebx + mov %edi, %esi + jmp L(loop) + + .p2align 4 +L(find_zero): + test %cl, %cl + jz L(find_zero_high) + mov %cl, %dl + and $15, %dl + jz L(find_zero_8) + test $0x01, %cl + jnz L(FindZeroExit1) + test $0x02, %cl + jnz L(FindZeroExit2) + test $0x04, %cl + jnz L(FindZeroExit3) + and $1 << 4 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_8): + test $0x10, %cl + jnz L(FindZeroExit5) + test $0x20, %cl + jnz L(FindZeroExit6) + test $0x40, %cl + jnz L(FindZeroExit7) + and $1 << 8 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_high): + mov %ch, %dh + and $15, %dh + jz L(find_zero_high_8) + test $0x01, %ch + jnz L(FindZeroExit9) + test $0x02, %ch + jnz L(FindZeroExit10) + test $0x04, %ch + jnz L(FindZeroExit11) + and $1 << 12 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_high_8): + test $0x10, %ch + jnz L(FindZeroExit13) + test $0x20, %ch + jnz L(FindZeroExit14) + test $0x40, %ch + jnz L(FindZeroExit15) + and $1 << 16 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit1): + and $1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit2): + and $1 << 2 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit3): + and $1 << 3 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit5): + and $1 << 5 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit6): + and $1 << 6 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit7): + and $1 << 7 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit9): + and $1 << 9 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP 
(%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit10): + and $1 << 10 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit11): + and $1 << 11 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit13): + and $1 << 13 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit14): + and $1 << 14 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit15): + and $1 << 15 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + + .p2align 4 +L(match_exit): + test %ah, %ah + jnz L(match_exit_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(match_exit_8) + test $0x08, %al + jnz L(Exit4) + test $0x04, %al + jnz L(Exit3) + test $0x02, %al + jnz L(Exit2) + lea -16(%edi), %eax + RETURN + + .p2align 4 +L(match_exit_8): + test $0x80, %al + jnz L(Exit8) + test $0x40, %al + jnz L(Exit7) + test $0x20, %al + jnz L(Exit6) + lea -12(%edi), %eax + RETURN + + .p2align 4 +L(match_exit_high): + mov %ah, %dh + and $15 << 4, %dh + jnz L(match_exit_high_8) + test $0x08, %ah + jnz L(Exit12) + test $0x04, %ah + jnz L(Exit11) + test $0x02, %ah + jnz L(Exit10) + lea -8(%edi), %eax + RETURN + + .p2align 4 +L(match_exit_high_8): + test $0x80, %ah + jnz L(Exit16) + test $0x40, %ah + jnz L(Exit15) + test $0x20, %ah + jnz L(Exit14) + lea -4(%edi), %eax + RETURN + + .p2align 4 +L(Exit2): + lea -15(%edi), %eax + RETURN + + .p2align 4 +L(Exit3): + lea -14(%edi), %eax + RETURN + + .p2align 4 +L(Exit4): + lea -13(%edi), %eax + RETURN + + .p2align 4 +L(Exit6): + lea -11(%edi), %eax + RETURN + + .p2align 4 +L(Exit7): + lea -10(%edi), %eax + RETURN + + .p2align 4 +L(Exit8): + lea -9(%edi), %eax + RETURN + + .p2align 4 +L(Exit10): + lea -7(%edi), %eax + RETURN + + .p2align 4 +L(Exit11): + lea -6(%edi), %eax + RETURN + + .p2align 4 +L(Exit12): + lea -5(%edi), %eax + RETURN + + .p2align 4 +L(Exit14): + lea -3(%edi), %eax + RETURN + + .p2align 4 +L(Exit15): + lea -2(%edi), %eax + RETURN + + .p2align 4 +L(Exit16): + lea -1(%edi), %eax + RETURN + +/* Return NULL. 
*/ + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero): + add %ecx, %edi + mov %edx, %ecx +L(prolog_find_zero_1): + test %cl, %cl + jz L(prolog_find_zero_high) + mov %cl, %dl + and $15, %dl + jz L(prolog_find_zero_8) + test $0x01, %cl + jnz L(PrologFindZeroExit1) + test $0x02, %cl + jnz L(PrologFindZeroExit2) + test $0x04, %cl + jnz L(PrologFindZeroExit3) + and $1 << 4 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero_8): + test $0x10, %cl + jnz L(PrologFindZeroExit5) + test $0x20, %cl + jnz L(PrologFindZeroExit6) + test $0x40, %cl + jnz L(PrologFindZeroExit7) + and $1 << 8 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero_high): + mov %ch, %dh + and $15, %dh + jz L(prolog_find_zero_high_8) + test $0x01, %ch + jnz L(PrologFindZeroExit9) + test $0x02, %ch + jnz L(PrologFindZeroExit10) + test $0x04, %ch + jnz L(PrologFindZeroExit11) + and $1 << 12 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero_high_8): + test $0x10, %ch + jnz L(PrologFindZeroExit13) + test $0x20, %ch + jnz L(PrologFindZeroExit14) + test $0x40, %ch + jnz L(PrologFindZeroExit15) + and $1 << 16 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit1): + and $1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit2): + and $1 << 2 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit3): + and $1 << 3 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit5): + and $1 << 5 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit6): + and $1 << 6 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit7): + and $1 << 7 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit9): + and $1 << 9 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit10): + and $1 << 10 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit11): + and $1 << 11 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit13): + and $1 << 13 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit14): + and $1 << 14 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit15): + and $1 << 15 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + +END (__strrchr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S new file mode 100644 index 0000000000..d9281eaeae --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S @@ -0,0 +1,57 @@ +/* Multiple versions of strrchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
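
A note on the __strrchr_sse2 hunk that ends above: the loop scans 16 bytes per iteration with two pcmpeqb masks, one for the target byte (in %eax) and one for NUL (in %ecx). Matches in a block with no NUL are only remembered — %ebx/%esi hold the last candidate mask and its block address — and when a NUL finally appears, the "and $(1 << k) - 1" chains in L(find_zero) discard match bits past the terminator, falling back to the saved block if nothing survives. Below is a minimal C rendering of that bookkeeping, a sketch only: it assumes a 16-byte-aligned s, skips the unaligned prolog, and its helper name is illustrative, not glibc's.

#include <emmintrin.h>
#include <stddef.h>

/* Sketch of the last-match bookkeeping in __strrchr_sse2; the real
   code keeps last_mask/last_blk in %ebx/%esi.  Assumes s is 16-byte
   aligned so every load stays inside the page.  */
static const char *
strrchr_sse2_sketch (const char *s, int c)
{
  const __m128i vc = _mm_set1_epi8 ((char) c);
  const __m128i zero = _mm_setzero_si128 ();
  const char *last_blk = NULL;   /* block that held the last match */
  unsigned int last_mask = 0;    /* its match bits */

  for (;; s += 16)
    {
      __m128i v = _mm_load_si128 ((const __m128i *) s);
      unsigned int m = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, vc));
      unsigned int z = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, zero));
      if (z == 0)
        {
          if (m != 0)            /* candidates: remember, keep scanning */
            {
              last_mask = m;
              last_blk = s;
            }
          continue;
        }
      /* Terminator found: keep match bits up to and including it,
         like the inclusive masks in the L(FindZeroExit*) cases.  */
      m &= z ^ (z - 1);
      if (m == 0)                /* fall back to the saved block */
        {
          m = last_mask;
          s = last_blk;
          if (m == 0)
            return NULL;
        }
      return s + (31 - __builtin_clz (m));  /* highest bit = rightmost match */
    }
}

Masking inclusively rather than strictly below the terminator is what makes strrchr (s, '\0') return a pointer to the terminator itself, as POSIX requires.
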
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(strrchr) + .type strrchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strrchr_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__strrchr_sse2_bsf) + HAS_ARCH_FEATURE (Slow_BSF) + jz 2f + LOAD_FUNC_GOT_EAX (__strrchr_sse2) +2: ret +END(strrchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __strrchr_ia32, @function; \ + .globl __strrchr_ia32; \ + .p2align 4; \ + __strrchr_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strrchr_ia32, .-__strrchr_ia32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strrchr; __GI_strrchr = __strrchr_ia32 +#endif + +#include "../../strrchr.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c new file mode 100644 index 0000000000..bea09dea71 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c @@ -0,0 +1,2 @@ +#define __strspn_sse2 __strspn_ia32 +#include <sysdeps/x86_64/multiarch/strspn-c.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S new file mode 100644 index 0000000000..1269062381 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S @@ -0,0 +1,56 @@ +/* Multiple versions of strspn + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. 
*/ +#if IS_IN (libc) + .text +ENTRY(strspn) + .type strspn, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strspn_ia32) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + LOAD_FUNC_GOT_EAX (__strspn_sse42) +2: ret +END(strspn) + +# undef ENTRY +# define ENTRY(name) \ + .type __strspn_ia32, @function; \ + .globl __strspn_ia32; \ + .p2align 4; \ +__strspn_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strspn_ia32, .-__strspn_ia32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strspn; __GI_strspn = __strspn_ia32 +#endif + +#include "../../strspn.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c new file mode 100644 index 0000000000..593cfec273 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/test-multiarch.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c new file mode 100644 index 0000000000..7760b966e2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/varshift.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h new file mode 100644 index 0000000000..7c72c70d67 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/varshift.h> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c new file mode 100644 index 0000000000..38d41d04de --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c @@ -0,0 +1,22 @@ +#include <wchar.h> + +#if IS_IN (libc) +# undef libc_hidden_weak +# define libc_hidden_weak(name) + +# undef weak_alias +# define weak_alias(name,alias) + +# ifdef SHARED +# undef libc_hidden_def +# define libc_hidden_def(name) \ + __hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32); \ + strong_alias (__wcschr_ia32, __wcschr_ia32_1); \ + __hidden_ver1 (__wcschr_ia32_1, __GI___wcschr, __wcschr_ia32_1); +# endif +#endif + +extern __typeof (wcschr) __wcschr_ia32; + +#define WCSCHR __wcschr_ia32 +#include <wcsmbs/wcschr.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S new file mode 100644 index 0000000000..9ff6c3b8d6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S @@ -0,0 +1,219 @@ +/* wcschr with SSE2, without using bsf instructions + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
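
A note on the IFUNC dispatchers above (strrchr, strspn): the ENTRY is typed @gnu_indirect_function, so the dynamic linker calls it once during relocation and binds the symbol to whatever address the stub leaves in %eax — __strspn_sse42 when CPUID reports SSE4.2, otherwise __strspn_ia32. The same dispatch written in C uses GCC's ifunc attribute; the sketch below is illustrative only (my_strspn, the stand-in implementations, and the CPUID helper are all made-up names, not glibc's).

#include <stddef.h>
#include <string.h>

typedef size_t strspn_fn (const char *, const char *);

/* Stand-ins for __strspn_ia32 / __strspn_sse42.  */
static size_t strspn_ia32 (const char *s, const char *a) { return strspn (s, a); }
static size_t strspn_sse42 (const char *s, const char *a) { return strspn (s, a); }

/* Rough equivalent of HAS_CPU_FEATURE (SSE4_2): CPUID.1:ECX bit 20.  */
static int
cpu_has_sse42 (void)
{
  unsigned int eax, ebx, ecx, edx;
  __asm__ ("cpuid" : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (1));
  return (ecx >> 20) & 1;
}

/* The resolver runs at relocation time, before most of libc is usable;
   it must only inspect CPU state and return a function pointer.  */
static strspn_fn *
resolve_strspn (void)
{
  return cpu_has_sse42 () ? strspn_sse42 : strspn_ia32;
}

size_t my_strspn (const char *, const char *)
  __attribute__ ((ifunc ("resolve_strspn")));

This is also why the files above redefine libc_hidden_builtin_def: internal __GI_* callers are bound straight to the _ia32 version instead of going through the IFUNC, since (as the source comments note) hidden calls on i386 do not set up %ebx for the PLT that IFUNC resolution relies on.
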
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + + atom_text_section +ENTRY (__wcschr_sse2) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + mov %ecx, %eax + punpckldq %xmm1, %xmm1 + pxor %xmm2, %xmm2 + punpckldq %xmm1, %xmm1 + + and $63, %eax + cmp $48, %eax + ja L(cross_cache) + + movdqu (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + and $-16, %ecx + jmp L(loop) + + .p2align 4 +L(cross_cache): + PUSH (%edi) + mov %ecx, %edi + mov %eax, %ecx + and $-16, %edi + and $15, %ecx + movdqa (%edi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + + sarl %cl, %edx + sarl %cl, %eax + test %eax, %eax + jz L(unaligned_no_match) + + add %edi, %ecx + POP (%edi) + + test %edx, %edx + jz L(match_case1) + test %al, %al + jz L(match_higth_case2) + test $15, %al + jnz L(match_case2_4) + test $15, %dl + jnz L(return_null) + lea 4(%ecx), %eax + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(unaligned_no_match): + mov %edi, %ecx + POP (%edi) + + test %edx, %edx + jnz L(return_null) + + pxor %xmm2, %xmm2 + +/* Loop start on aligned string. */ + .p2align 4 +L(loop): + add $16, %ecx + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jz L(loop) + + .p2align 4 +L(matches): + pmovmskb %xmm2, %edx + test %eax, %eax + jz L(return_null) + test %edx, %edx + jz L(match_case1) + + .p2align 4 +L(match_case2): + test %al, %al + jz L(match_higth_case2) + test $15, %al + jnz L(match_case2_4) + test $15, %dl + jnz L(return_null) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(match_case2_4): + mov %ecx, %eax + ret + + .p2align 4 +L(match_higth_case2): + test %dl, %dl + jnz L(return_null) + test $15, %ah + jnz L(match_case2_12) + test $15, %dh + jnz L(return_null) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(match_case2_12): + lea 8(%ecx), %eax + ret + + .p2align 4 +L(match_case1): + test %al, %al + jz L(match_higth_case1) + + test $0x01, %al + jnz L(exit0) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(match_higth_case1): + test $0x01, %ah + jnz L(exit3) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(exit0): + mov %ecx, %eax + ret + + .p2align 4 +L(exit3): + lea 8(%ecx), %eax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + +END (__wcschr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S new file mode 100644 index 0000000000..d3c65a6436 --- /dev/null +++ 
b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S @@ -0,0 +1,36 @@ +/* Multiple versions of wcschr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__wcschr) + .type wcschr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcschr_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__wcschr_sse2) +2: ret +END(__wcschr) +weak_alias (__wcschr, wcschr) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c new file mode 100644 index 0000000000..e3337d77e2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c @@ -0,0 +1,14 @@ +#include <wchar.h> + +#define WCSCMP __wcscmp_ia32 +#ifdef SHARED +# undef libc_hidden_def +# define libc_hidden_def(name) \ + __hidden_ver1 (__wcscmp_ia32, __GI___wcscmp, __wcscmp_ia32); +#endif +#undef weak_alias +#define weak_alias(name, alias) + +extern __typeof (wcscmp) __wcscmp_ia32; + +#include "wcsmbs/wcscmp.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S new file mode 100644 index 0000000000..a464b58204 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S @@ -0,0 +1,1018 @@ +/* wcscmp with SSE2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define ENTRANCE PUSH(%esi); PUSH(%edi) +# define RETURN POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi); +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + +/* Note: wcscmp uses signed comparison, not unsigned as in the strcmp function.
*/ + + .text +ENTRY (__wcscmp_sse2) +/* + * This implementation uses SSE to compare up to 16 bytes at a time. +*/ + mov STR1(%esp), %edx + mov STR2(%esp), %eax + + mov (%eax), %ecx + cmp %ecx, (%edx) + jne L(neq) + test %ecx, %ecx + jz L(eq) + + mov 4(%eax), %ecx + cmp %ecx, 4(%edx) + jne L(neq) + test %ecx, %ecx + jz L(eq) + + mov 8(%eax), %ecx + cmp %ecx, 8(%edx) + jne L(neq) + test %ecx, %ecx + jz L(eq) + + mov 12(%eax), %ecx + cmp %ecx, 12(%edx) + jne L(neq) + test %ecx, %ecx + jz L(eq) + + ENTRANCE + add $16, %eax + add $16, %edx + + mov %eax, %esi + mov %edx, %edi + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + mov %al, %ch + mov %dl, %cl + and $63, %eax /* esi alignment in cache line */ + and $63, %edx /* edi alignment in cache line */ + and $15, %cl + jz L(continue_00) + cmp $16, %edx + jb L(continue_0) + cmp $32, %edx + jb L(continue_16) + cmp $48, %edx + jb L(continue_32) + +L(continue_48): + and $15, %ch + jz L(continue_48_00) + cmp $16, %eax + jb L(continue_0_48) + cmp $32, %eax + jb L(continue_16_48) + cmp $48, %eax + jb L(continue_32_48) + + .p2align 4 +L(continue_48_48): + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%edi), %xmm1 + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%edi), %xmm1 + movdqu 48(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_48_48) + +L(continue_0): + and $15, %ch + jz L(continue_0_00) + cmp $16, %eax + jb L(continue_0_0) + cmp $32, %eax + jb L(continue_0_16) + cmp $48, %eax + jb L(continue_0_32) + + .p2align 4 +L(continue_0_48): + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%edi), %xmm1 + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + mov 48(%esi), %ecx + cmp %ecx, 48(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 52(%esi), %ecx + cmp %ecx, 52(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 56(%esi), %ecx + cmp %ecx, 56(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 60(%esi), %ecx + cmp %ecx, 60(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + add $64, %esi + add $64, %edi + jmp L(continue_0_48) + + .p2align 4 +L(continue_00): + and $15, %ch + jz L(continue_00_00) + cmp $16, %eax + jb L(continue_00_0) + cmp $32, %eax + jb L(continue_00_16) + cmp $48, %eax + jb L(continue_00_32) + + .p2align 4 +L(continue_00_48): + pcmpeqd (%edi), %xmm0 + mov (%edi), %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(less4_double_words1) + + cmp (%esi), %eax + jne L(nequal) + + mov 4(%edi), %eax + cmp 4(%esi), %eax + jne L(nequal) + + mov 8(%edi), %eax + cmp 8(%esi), %eax + jne L(nequal) + + mov 12(%edi), %eax + cmp 12(%esi), %eax + jne L(nequal) + + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? 
*/ + pcmpeqd 48(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_00_48) + + .p2align 4 +L(continue_32): + and $15, %ch + jz L(continue_32_00) + cmp $16, %eax + jb L(continue_0_32) + cmp $32, %eax + jb L(continue_16_32) + cmp $48, %eax + jb L(continue_32_32) + + .p2align 4 +L(continue_32_48): + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 16(%esi), %ecx + cmp %ecx, 16(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 20(%esi), %ecx + cmp %ecx, 20(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 24(%esi), %ecx + cmp %ecx, 24(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 28(%esi), %ecx + cmp %ecx, 28(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 32(%edi), %xmm1 + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%edi), %xmm1 + movdqu 48(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results */ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_32_48) + + .p2align 4 +L(continue_16): + and $15, %ch + jz L(continue_16_00) + cmp $16, %eax + jb L(continue_0_16) + cmp $32, %eax + jb L(continue_16_16) + cmp $48, %eax + jb L(continue_16_32) + + .p2align 4 +L(continue_16_48): + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%edi), %xmm1 + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + mov 32(%esi), %ecx + cmp %ecx, 32(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 36(%esi), %ecx + cmp %ecx, 36(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 40(%esi), %ecx + cmp %ecx, 40(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 44(%esi), %ecx + cmp %ecx, 44(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 48(%edi), %xmm1 + movdqu 48(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_16_48) + + .p2align 4 +L(continue_00_00): + movdqa (%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqa 16(%edi), %xmm3 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%esi), %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqa 32(%edi), %xmm5 + pcmpeqd %xmm5, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%esi), %xmm5 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm5 /* packed sub of comparison results*/ + pmovmskb %xmm5, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqa 48(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_00_00) + + .p2align 4 +L(continue_00_32): + movdqu (%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_00_48) + + .p2align 4 +L(continue_00_16): + movdqu (%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %esi + add $32, %edi + jmp L(continue_00_48) + + .p2align 4 +L(continue_00_0): + movdqu (%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? 
*/ + pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %esi + add $48, %edi + jmp L(continue_00_48) + + .p2align 4 +L(continue_48_00): + pcmpeqd (%esi), %xmm0 + mov (%edi), %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(less4_double_words1) + + cmp (%esi), %eax + jne L(nequal) + + mov 4(%edi), %eax + cmp 4(%esi), %eax + jne L(nequal) + + mov 8(%edi), %eax + cmp 8(%esi), %eax + jne L(nequal) + + mov 12(%edi), %eax + cmp 12(%esi), %eax + jne L(nequal) + + movdqu 16(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_48_00) + + .p2align 4 +L(continue_32_00): + movdqu (%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_48_00) + + .p2align 4 +L(continue_16_00): + movdqu (%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %esi + add $32, %edi + jmp L(continue_48_00) + + .p2align 4 +L(continue_0_00): + movdqu (%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %esi + add $48, %edi + jmp L(continue_48_00) + + .p2align 4 +L(continue_32_32): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_48_48) + + .p2align 4 +L(continue_16_16): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm3 + movdqu 16(%esi), %xmm4 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %esi + add $32, %edi + jmp L(continue_48_48) + + .p2align 4 +L(continue_0_0): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm3 + movdqu 16(%esi), %xmm4 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %esi + add $48, %edi + jmp L(continue_48_48) + + .p2align 4 +L(continue_0_16): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm1 + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %esi + add $32, %edi + jmp L(continue_32_48) + + .p2align 4 +L(continue_0_32): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_16_48) + + .p2align 4 +L(continue_16_32): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_32_48) + + .p2align 4 +L(less4_double_words1): + cmp (%esi), %eax + jne L(nequal) + test %eax, %eax + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + xor %eax, %eax + RETURN + + .p2align 4 +L(less4_double_words): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word): + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word): + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(less4_double_words_16): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_16) + and $15, %dl + jz L(second_double_word_16) + mov 16(%esi), %ecx + cmp %ecx, 16(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word_16): + mov 20(%esi), %ecx + cmp %ecx, 20(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words_16): + and $15, %dh + jz L(fourth_double_word_16) + mov 24(%esi), %ecx + cmp %ecx, 24(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word_16): + mov 28(%esi), %ecx + cmp %ecx, 28(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(less4_double_words_32): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_32) + and $15, %dl + jz L(second_double_word_32) + mov 32(%esi), %ecx + cmp %ecx, 32(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word_32): + mov 36(%esi), %ecx + cmp %ecx, 36(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words_32): + and $15, %dh + jz L(fourth_double_word_32) + mov 40(%esi), %ecx + cmp %ecx, 
40(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word_32): + mov 44(%esi), %ecx + cmp %ecx, 44(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(less4_double_words_48): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_48) + and $15, %dl + jz L(second_double_word_48) + mov 48(%esi), %ecx + cmp %ecx, 48(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word_48): + mov 52(%esi), %ecx + cmp %ecx, 52(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words_48): + and $15, %dh + jz L(fourth_double_word_48) + mov 56(%esi), %ecx + cmp %ecx, 56(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word_48): + mov 60(%esi), %ecx + cmp %ecx, 60(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(nequal): + mov $1, %eax + jg L(return) + neg %eax + RETURN + + .p2align 4 +L(return): + RETURN + + .p2align 4 +L(equal): + xorl %eax, %eax + RETURN + + CFI_POP (%edi) + CFI_POP (%esi) + + .p2align 4 +L(neq): + mov $1, %eax + jg L(neq_bigger) + neg %eax + +L(neq_bigger): + ret + + .p2align 4 +L(eq): + xorl %eax, %eax + ret + +END (__wcscmp_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S new file mode 100644 index 0000000000..7118bdd4db --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S @@ -0,0 +1,39 @@ +/* Multiple versions of wcscmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc and for the + DSO. In static binaries, we need wcscmp before the initialization + happened. */ +#if IS_IN (libc) + .text +ENTRY(__wcscmp) + .type __wcscmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcscmp_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__wcscmp_sse2) +2: ret +END(__wcscmp) +weak_alias (__wcscmp, wcscmp) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c new file mode 100644 index 0000000000..fb3000392b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c @@ -0,0 +1,5 @@ +#if IS_IN (libc) +# define wcscpy __wcscpy_ia32 +#endif + +#include "wcsmbs/wcscpy.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S new file mode 100644 index 0000000000..6280ba92ab --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S @@ -0,0 +1,600 @@ +/* wcscpy with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
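
A note on the __wcscmp_sse2 hunk that ends above, before the wcscpy implementation begins: every 16-byte step uses the same three-instruction test. pcmpeqd against zero yields a NUL-dword mask, pcmpeqd of the two operands yields an equality mask, and psubb combines them so that pmovmskb gives 0xffff exactly when all four wchar_t are equal and nonzero; "sub $0xffff, %edx" then turns that into the keep-scanning flag. The sixteen continue_XX_YY entry points differ only in which of the two pointers may use aligned loads. Below is a hedged C sketch of the per-block test and of the signed three-way result built at L(nequal); the function names are illustrative.

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <wchar.h>

/* One 16-byte step of the wcscmp main loops, with unaligned loads for
   simplicity.  Nonzero means this block holds a difference or L'\0';
   the assembly then drops into the L(less4_double_words*) tails to
   locate the exact dword.  Assumes 4-byte wchar_t, as on x86.  */
static int
wcscmp_block_stops (const wchar_t *a, const wchar_t *b)
{
  __m128i va  = _mm_loadu_si128 ((const __m128i *) a);
  __m128i vb  = _mm_loadu_si128 ((const __m128i *) b);
  __m128i nul = _mm_cmpeq_epi32 (va, _mm_setzero_si128 ());
  __m128i eq  = _mm_cmpeq_epi32 (va, vb);
  /* eq - nul leaves 0xff bytes only where the dwords are equal AND
     nonzero, so pmovmskb yields 0xffff exactly when scanning may
     continue -- the "sub $0xffff, %edx" in the assembly.  */
  return _mm_movemask_epi8 (_mm_sub_epi8 (eq, nul)) != 0xffff;
}

/* The signed three-way result produced at L(nequal):
   mov $1, %eax; jg ...; neg %eax.  */
static int
wcscmp_tail (wchar_t a, wchar_t b)
{
  return a == b ? 0 : (a > b ? 1 : -1);
}

The signed ordering here is the point of the file-header note: wchar_t values are compared as signed integers, unlike strcmp's unsigned byte comparison.
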
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define RETURN POP (%edi); ret; CFI_PUSH (%edi) +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + + atom_text_section +ENTRY (__wcscpy_ssse3) + mov STR1(%esp), %edx + mov STR2(%esp), %ecx + + cmp $0, (%ecx) + jz L(ExitTail4) + cmp $0, 4(%ecx) + jz L(ExitTail8) + cmp $0, 8(%ecx) + jz L(ExitTail12) + cmp $0, 12(%ecx) + jz L(ExitTail16) + + PUSH (%edi) + mov %edx, %edi + PUSH (%esi) + lea 16(%ecx), %esi + + and $-16, %esi + + pxor %xmm0, %xmm0 + pcmpeqd (%esi), %xmm0 + movdqu (%ecx), %xmm1 + movdqu %xmm1, (%edx) + + pmovmskb %xmm0, %eax + sub %ecx, %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + mov %edx, %eax + lea 16(%edx), %edx + and $-16, %edx + sub %edx, %eax + + sub %eax, %ecx + mov %ecx, %eax + and $0xf, %eax + mov $0, %esi + + jz L(Align16Both) + cmp $4, %eax + je L(Shl4) + cmp $8, %eax + je L(Shl8) + jmp L(Shl12) + +L(Align16Both): + movaps (%ecx), %xmm1 + movaps 16(%ecx), %xmm2 + movaps %xmm1, (%edx) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm4 + movaps %xmm3, (%edx, %esi) + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm1 + movaps %xmm4, (%edx, %esi) + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm2 + movaps %xmm1, (%edx, %esi) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%edx, %esi) + mov %ecx, %eax + lea 16(%ecx, %esi), %ecx + and $-0x40, %ecx + sub %ecx, %eax + sub %eax, %edx + + mov $-0x40, %esi + +L(Aligned64Loop): + movaps (%ecx), %xmm2 + movaps 32(%ecx), %xmm3 + movaps %xmm2, %xmm4 + movaps 16(%ecx), %xmm5 + movaps %xmm3, %xmm6 + movaps 48(%ecx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + lea 64(%edx), %edx + pcmpeqd %xmm0, %xmm3 + lea 64(%ecx), %ecx + pmovmskb %xmm3, %eax + + test %eax, %eax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%edx) + movaps %xmm5, -48(%edx) + movaps %xmm6, -32(%edx) + movaps %xmm7, 
-16(%edx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm5, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm4, -64(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm6, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm5, -48(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%edx) + pcmpeqd %xmm7, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + mov $-0x40, %esi + movaps %xmm7, -16(%edx) + jmp L(Aligned64Loop) + + .p2align 4 +L(Shl4): + movaps -4(%ecx), %xmm1 + movaps 12(%ecx), %xmm2 +L(Shl4Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm1 + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 28(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -12(%ecx), %ecx + sub %eax, %edx + + movaps -4(%ecx), %xmm1 + +L(Shl4LoopStart): + movaps 12(%ecx), %xmm2 + movaps 28(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %eax, %eax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) + + palignr $4, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) + POP (%esi) + add $12, %edx + add $12, %ecx + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(Shl8): + movaps -8(%ecx), %xmm1 + movaps 8(%ecx), %xmm2 +L(Shl8Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm1 + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, 
%xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 24(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -8(%ecx), %ecx + sub %eax, %edx + + movaps -8(%ecx), %xmm1 + +L(Shl8LoopStart): + movaps 8(%ecx), %xmm2 + movaps 24(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %eax, %eax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) + + palignr $8, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + POP (%esi) + add $8, %edx + add $8, %ecx + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(Shl12): + movaps -12(%ecx), %xmm1 + movaps 4(%ecx), %xmm2 +L(Shl12Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm1 + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 20(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -4(%ecx), %ecx + sub %eax, %edx + + movaps -12(%ecx), %xmm1 + +L(Shl12LoopStart): + movaps 4(%ecx), %xmm2 + movaps 20(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %eax, %eax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) + + palignr $12, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + movl (%ecx), %esi + movl %esi, (%edx) + mov $4, %esi + + .p2align 4 +L(CopyFrom1To16Bytes): + add %esi, %edx + add %esi, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) +L(Exit8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit12) +L(Exit16): + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + movl %edi, %eax + RETURN + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) + movl %edi, %eax + RETURN + + .p2align 4 +L(Exit12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) + movl %edi, %eax + RETURN + +CFI_POP (%edi) + + 
.p2align 4 +L(ExitTail4): + movl (%ecx), %eax + movl %eax, (%edx) + movl %edx, %eax + ret + + .p2align 4 +L(ExitTail8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edx, %eax + ret + + .p2align 4 +L(ExitTail12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) + movl %edx, %eax + ret + + .p2align 4 +L(ExitTail16): + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + movl %edx, %eax + ret + +END (__wcscpy_ssse3) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S new file mode 100644 index 0000000000..cfc97dd87c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S @@ -0,0 +1,36 @@ +/* Multiple versions of wcscpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(wcscpy) + .type wcscpy, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcscpy_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__wcscpy_ssse3) +2: ret +END(wcscpy) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c new file mode 100644 index 0000000000..a335dc0f7e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c @@ -0,0 +1,9 @@ +#include <wchar.h> + +#if IS_IN (libc) +# define WCSLEN __wcslen_ia32 +#endif + +extern __typeof (wcslen) __wcslen_ia32; + +#include "wcsmbs/wcslen.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S new file mode 100644 index 0000000000..bd3fc4c79b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S @@ -0,0 +1,193 @@ +/* wcslen with SSE2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
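
A note on the __wcscpy_ssse3 hunk that ends above: once the destination is 16-byte aligned, the source is off by 0, 4, 8, or 12 bytes, and the Shl4/Shl8/Shl12 loops read only aligned source blocks, splicing each consecutive pair into one aligned store with palignr. The zero test is batched: pminub folds four blocks bytewise so a single pcmpeqd against zero can fire; a hit only means "maybe a terminator", and the loop then retests the blocks one at a time, so false positives cost time but never correctness. A sketch of one Shl4-style splice with intrinsics follows; the helper name and the alignment preconditions are assumptions for illustration.

#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 (palignr) */

/* Copy 16 logical string bytes when src == src_aligned + 4 and dst is
   16-byte aligned, the way one L(Shl4) iteration does: two aligned
   loads, one palignr, one aligned store.  */
static void
copy16_shl4 (char *dst, const char *src_aligned)
{
  __m128i prev = _mm_load_si128 ((const __m128i *) src_aligned);
  __m128i next = _mm_load_si128 ((const __m128i *) (src_aligned + 16));
  /* palignr $4: bytes prev[4..15] followed by next[0..3], i.e. the 16
     bytes starting at the true, unaligned source position.  */
  _mm_store_si128 ((__m128i *) dst, _mm_alignr_epi8 (next, prev, 4));
}

The real loop unrolls four such splices per iteration and performs the batched zero check before storing, bailing back to L(Shl4Start) to pin down the terminator precisely.
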
*/ + +#if IS_IN (libc) +# include <sysdep.h> +# define STR 4 + + .text +ENTRY (__wcslen_sse2) + mov STR(%esp), %edx + + cmp $0, (%edx) + jz L(exit_tail0) + cmp $0, 4(%edx) + jz L(exit_tail1) + cmp $0, 8(%edx) + jz L(exit_tail2) + cmp $0, 12(%edx) + jz L(exit_tail3) + cmp $0, 16(%edx) + jz L(exit_tail4) + cmp $0, 20(%edx) + jz L(exit_tail5) + cmp $0, 24(%edx) + jz L(exit_tail6) + cmp $0, 28(%edx) + jz L(exit_tail7) + + pxor %xmm0, %xmm0 + + lea 32(%edx), %eax + lea 16(%edx), %ecx + and $-16, %eax + + pcmpeqd (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + and $-0x40, %eax + + .p2align 4 +L(aligned_64_loop): + movaps (%eax), %xmm0 + movaps 16(%eax), %xmm1 + movaps 32(%eax), %xmm2 + movaps 48(%eax), %xmm6 + + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 64(%eax), %eax + jz L(aligned_64_loop) + + pcmpeqd -64(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 48(%ecx), %ecx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqd -32(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + jmp L(aligned_64_loop) + + .p2align 4 +L(exit): + sub %ecx, %eax + shr $2, %eax + test %dl, %dl + jz L(exit_high) + + mov %dl, %cl + and $15, %cl + jz L(exit_1) + ret + + .p2align 4 +L(exit_high): + mov %dh, %ch + and $15, %ch + jz L(exit_3) + add $2, %eax + ret + + .p2align 4 +L(exit_1): + add $1, %eax + ret + + .p2align 4 +L(exit_3): + add $3, %eax + ret + + .p2align 4 +L(exit_tail0): + xor %eax, %eax + ret + + .p2align 4 +L(exit_tail1): + mov $1, %eax + ret + + .p2align 4 +L(exit_tail2): + mov $2, %eax + ret + + .p2align 4 +L(exit_tail3): + mov $3, %eax + ret + + .p2align 4 +L(exit_tail4): + mov $4, %eax + ret + + .p2align 4 +L(exit_tail5): + mov $5, %eax + ret + + .p2align 4 +L(exit_tail6): + mov $6, %eax + ret + + .p2align 4 +L(exit_tail7): + mov $7, %eax + ret + +END (__wcslen_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S new file mode 100644 index 0000000000..6ef9b6e7b5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S @@ -0,0 +1,37 @@ +/* Multiple versions of wcslen + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__wcslen) + .type __wcslen, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcslen_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__wcslen_sse2) +2: ret +END(__wcslen) + +weak_alias(__wcslen, wcslen) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c new file mode 100644 index 0000000000..8d8a335b5b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c @@ -0,0 +1,5 @@ +#if IS_IN (libc) +# define wcsrchr __wcsrchr_ia32 +#endif + +#include "wcsmbs/wcsrchr.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S new file mode 100644 index 0000000000..1a9b60e55e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S @@ -0,0 +1,354 @@ +/* wcsrchr with SSE2, without using bsf instructions. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# include <sysdep.h> +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) + define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 +# define ENTRANCE PUSH (%edi); +# define RETURN POP (%edi); ret; CFI_PUSH (%edi); +# define STR1 PARMS +# define STR2 STR1+4 + + atom_text_section +ENTRY (__wcsrchr_sse2) + + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + mov %ecx, %edi + punpckldq %xmm1, %xmm1 + pxor %xmm2, %xmm2 + punpckldq %xmm1, %xmm1 + +/* ECX has OFFSET. */ + and $63, %ecx + cmp $48, %ecx + ja L(crosscache) + +/* Unaligned string. */ + movdqu (%edi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 +/* Find where NULL is. */ + pmovmskb %xmm2, %ecx +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + add $16, %edi + + test %eax, %eax + jnz L(unaligned_match1) + + test %ecx, %ecx + jnz L(return_null) + + and $-16, %edi + + PUSH (%esi) + + xor %edx, %edx + jmp L(loop) + + CFI_POP (%esi) + + .p2align 4 +L(unaligned_match1): + test %ecx, %ecx + jnz L(prolog_find_zero_1) + + PUSH (%esi) + +/* Save current match. */ + mov %eax, %edx + mov %edi, %esi + and $-16, %edi + jmp L(loop) + + CFI_POP (%esi) + + .p2align 4 +L(crosscache): +/* Handle unaligned string. */ + and $15, %ecx + and $-16, %edi + pxor %xmm3, %xmm3 + movdqa (%edi), %xmm0 + pcmpeqd %xmm0, %xmm3 + pcmpeqd %xmm1, %xmm0 +/* Find where NULL is. */ + pmovmskb %xmm3, %edx +/* Check if there is a match.
*/ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + shr %cl, %edx + shr %cl, %eax + add $16, %edi + + test %eax, %eax + jnz L(unaligned_match) + + test %edx, %edx + jnz L(return_null) + + PUSH (%esi) + + xor %edx, %edx + jmp L(loop) + + CFI_POP (%esi) + + .p2align 4 +L(unaligned_match): + test %edx, %edx + jnz L(prolog_find_zero) + + PUSH (%esi) + + mov %eax, %edx + lea (%edi, %ecx), %esi + +/* Loop start on aligned string. */ + .p2align 4 +L(loop): + movdqa (%edi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm3 + pcmpeqd %xmm3, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm2, %ecx + pmovmskb %xmm3, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm4 + pcmpeqd %xmm4, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm4 + pmovmskb %xmm2, %ecx + pmovmskb %xmm4, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm5 + pcmpeqd %xmm5, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm5 + pmovmskb %xmm2, %ecx + pmovmskb %xmm5, %eax + or %eax, %ecx + jz L(loop) + + .p2align 4 +L(matches): + test %eax, %eax + jnz L(match) +L(return_value): + test %edx, %edx + jz L(return_null_1) + mov %edx, %eax + mov %esi, %edi + + POP (%esi) + + test %ah, %ah + jnz L(match_third_or_fourth_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(return_null_1): + POP (%esi) + + xor %eax, %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(match): + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(find_zero) +/* save match info */ + mov %eax, %edx + mov %edi, %esi + jmp L(loop) + + .p2align 4 +L(find_zero): + test %cl, %cl + jz L(find_zero_in_third_or_fourth_wchar) + test $15, %cl + jz L(find_zero_in_second_wchar) + and $1, %eax + jz L(return_value) + + POP (%esi) + + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_in_second_wchar): + and $1 << 5 - 1, %eax + jz L(return_value) + + POP (%esi) + + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_in_third_or_fourth_wchar): + test $15, %ch + jz L(find_zero_in_fourth_wchar) + and $1 << 9 - 1, %eax + jz L(return_value) + + POP (%esi) + + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_in_fourth_wchar): + + POP (%esi) + + test %ah, %ah + jnz L(match_third_or_fourth_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(match_second_wchar): + lea -12(%edi), %eax + RETURN + + .p2align 4 +L(match_third_or_fourth_wchar): + test $15 << 4, %ah + jnz L(match_fourth_wchar) + lea -8(%edi), %eax + RETURN + + .p2align 4 +L(match_third_wchar): + lea -8(%edi), %eax + RETURN + + .p2align 4 +L(match_fourth_wchar): + lea -4(%edi), %eax + RETURN + + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero): + add %ecx, %edi + mov %edx, %ecx +L(prolog_find_zero_1): + test %cl, %cl + jz L(prolog_find_zero_in_third_or_fourth_wchar) + test $15, %cl + jz L(prolog_find_zero_in_second_wchar) + and $1, %eax + jz L(return_null) + + lea -16(%edi), %eax + RETURN + + .p2align 4 +L(prolog_find_zero_in_second_wchar): + and $1 << 5 - 1, %eax + jz L(return_null) + + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + .p2align 4 
+L(prolog_find_zero_in_third_or_fourth_wchar): + test $15, %ch + jz L(prolog_find_zero_in_fourth_wchar) + and $1 << 9 - 1, %eax + jz L(return_null) + + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + .p2align 4 +L(prolog_find_zero_in_fourth_wchar): + test %ah, %ah + jnz L(match_third_or_fourth_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + +END (__wcsrchr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S new file mode 100644 index 0000000000..cf67333995 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S @@ -0,0 +1,35 @@ +/* Multiple versions of wcsrchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(wcsrchr) + .type wcsrchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcsrchr_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__wcsrchr_sse2) +2: ret +END(wcsrchr) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c new file mode 100644 index 0000000000..75ab4b94c1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c @@ -0,0 +1,9 @@ +#include <wchar.h> + +#if IS_IN (libc) +# define WMEMCMP __wmemcmp_ia32 +#endif + +extern __typeof (wmemcmp) __wmemcmp_ia32; + +#include "wcsmbs/wmemcmp.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S new file mode 100644 index 0000000000..1a857c7e21 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse4_2 + +#include "memcmp-sse4.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S new file mode 100644 index 0000000000..a41ef95fc1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_ssse3 + +#include "memcmp-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S new file mode 100644 index 0000000000..1b9a54a413 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S @@ -0,0 +1,40 @@ +/* Multiple versions of wmemcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) + .text +ENTRY(wmemcmp) + .type wmemcmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wmemcmp_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__wmemcmp_ssse3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + LOAD_FUNC_GOT_EAX (__wmemcmp_sse4_2) +2: ret +END(wmemcmp) +#endif
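
The IFUNC wrappers in this directory (wcscpy.S, wcslen.S, wcsrchr.S and wmemcmp.S above) all share one dispatch shape: load the baseline __*_ia32 entry point into %eax, then overwrite it once for each CPU feature that tests true, so the strongest available variant wins. A minimal C-level sketch of the same cascade, using GCC's ifunc attribute and __builtin_cpu_supports instead of glibc's internal LOAD_FUNC_GOT_EAX/HAS_CPU_FEATURE machinery; it assumes the three variant definitions are linked in, and uses a distinct exported name to avoid clashing with the wchar.h declaration (an illustration, not the glibc source):

#include <stddef.h>
#include <wchar.h>

/* The three wmemcmp variants named in the hunk above.  */
extern int __wmemcmp_ia32 (const wchar_t *, const wchar_t *, size_t);
extern int __wmemcmp_ssse3 (const wchar_t *, const wchar_t *, size_t);
extern int __wmemcmp_sse4_2 (const wchar_t *, const wchar_t *, size_t);

typedef int (*wmemcmp_fn) (const wchar_t *, const wchar_t *, size_t);

/* An ifunc resolver runs at relocation time, before constructors, so
   it initializes the CPU model itself and makes no external calls.
   Priority matches the assembly: SSE4.2, then SSSE3, then ia32.  */
static wmemcmp_fn
resolve_wmemcmp (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("sse4.2"))
    return __wmemcmp_sse4_2;
  if (__builtin_cpu_supports ("ssse3"))
    return __wmemcmp_ssse3;
  return __wmemcmp_ia32;
}

/* In glibc the exported wmemcmp symbol itself is the IFUNC.  */
int wmemcmp_ifunc (const wchar_t *, const wchar_t *, size_t)
     __attribute__ ((ifunc ("resolve_wmemcmp")));

The assembly spells the same cascade backwards: it starts from the weakest entry point and keeps replacing %eax while feature checks succeed, which yields the identical SSE4.2 > SSSE3 > ia32 priority.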
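
The scanning kernel of __wcslen_sse2 above works one 16-byte vector at a time: pcmpeqd turns every all-zero 32-bit lane into all-ones, pmovmskb compresses that into a 16-bit mask (four bits per wchar_t), and the lowest set bit locates the terminator. The real routine additionally unrolls to 64 bytes and folds the four loads with pminub so a single test covers all of them, rechecking each vector individually after a hit. A single-vector sketch with SSE2 intrinsics, assuming a 4-byte wchar_t and compiled with -msse2 on i686 (the function name and structure are illustrative, not glibc's):

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stdint.h>
#include <wchar.h>

size_t
wcslen_sse2_sketch (const wchar_t *s)
{
  /* Align down to 16 bytes: aligned loads cannot fault past a valid
     string, but lanes sitting before S must be masked out.  */
  const __m128i *v = (const __m128i *) ((uintptr_t) s & ~(uintptr_t) 15);
  unsigned int skip = (unsigned int) (((uintptr_t) s >> 2) & 3);
  const __m128i zero = _mm_setzero_si128 ();

  for (;;)
    {
      /* Four mask bits per 32-bit lane that compared equal to zero.  */
      int mask = _mm_movemask_epi8 (_mm_cmpeq_epi32 (_mm_load_si128 (v),
                                                     zero));
      mask &= ~((1 << (skip * 4)) - 1);   /* Ignore lanes before S.  */
      if (mask != 0)
        {
          unsigned int lane = (unsigned int) __builtin_ctz ((unsigned int) mask) / 4;
          return (size_t) ((const wchar_t *) v + lane - s);
        }
      v++;
      skip = 0;
    }
}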
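
The Shl8/Shl12 blocks from __wcscpy_ssse3 near the start of this section handle a source that is out of phase with the destination by a non-multiple of 16: the loop performs only aligned 16-byte loads and uses palignr to splice a shifted window out of two consecutive blocks, so every store is aligned as well. The same splice in SSSE3 intrinsics (compile with -mssse3), shown for the 8-byte phase of L(Shl8); the helper name is illustrative:

#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 */

/* Produce the 16 output bytes that begin 8 bytes into an aligned
   source block: concatenate two adjacent blocks and shift right by
   8 bytes, which is what `palignr $8' does in the .S loop.  */
static inline void
copy16_shl8 (__m128i *dst16, const __m128i *src16)
{
  __m128i prev = _mm_load_si128 (src16);       /* source bytes 0..15   */
  __m128i next = _mm_load_si128 (src16 + 1);   /* source bytes 16..31  */
  _mm_store_si128 (dst16, _mm_alignr_epi8 (next, prev, 8)); /* 8..23 */
}

The assembly keeps four such windows in flight per iteration and runs the pminub/pcmpeqd zero test before storing, because wcscpy must stop at the terminator rather than copy a known length.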