diff options
Diffstat (limited to 'REORG.TODO/sysdeps/i386/i686/multiarch')
127 files changed, 32113 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile new file mode 100644 index 0000000000..4a0c20c051 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile @@ -0,0 +1,44 @@ +ifeq ($(subdir),csu) +tests += test-multiarch +endif + +ifeq ($(subdir),string) +gen-as-const-headers += locale-defines.sym +sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ + memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \ + memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \ + memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \ + strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \ + memcmp-ssse3 memcmp-sse4 varshift \ + strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \ + strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \ + strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \ + strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \ + strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \ + memchr-sse2 memchr-sse2-bsf \ + memrchr-sse2 memrchr-sse2-bsf memrchr-c \ + rawmemchr-sse2 rawmemchr-sse2-bsf \ + strnlen-sse2 strnlen-c \ + strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \ + strncase_l-c strncase-c strncase_l-ssse3 \ + strcasecmp_l-sse4 strncase_l-sse4 \ + bcopy-sse2-unaligned memcpy-sse2-unaligned \ + mempcpy-sse2-unaligned memmove-sse2-unaligned \ + strcspn-c strpbrk-c strspn-c +CFLAGS-varshift.c += -msse4 +CFLAGS-strcspn-c.c += -msse4 +CFLAGS-strpbrk-c.c += -msse4 +CFLAGS-strspn-c.c += -msse4 +endif + +ifeq ($(subdir),wcsmbs) +sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \ + wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \ + wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c +endif + +ifeq ($(subdir),math) +libm-sysdep_routines += s_fma-fma s_fmaf-fma +CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse +CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse +endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S new file mode 100644 index 0000000000..efef2a10dd --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_sse2_unaligned +#include "memcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S new file mode 100644 index 0000000000..cbc8b420e8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S new file mode 100644 index 0000000000..36aac44b9c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_ssse3 +#include "memcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S new file mode 100644 index 0000000000..877f82c28f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S @@ -0,0 +1,59 @@ +/* Multiple versions of bcopy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) + .text +ENTRY(bcopy) + .type bcopy, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__bcopy_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__bcopy_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__bcopy_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__bcopy_ssse3_rep) +2: ret +END(bcopy) + +# undef ENTRY +# define ENTRY(name) \ + .type __bcopy_ia32, @function; \ + .p2align 4; \ + .globl __bcopy_ia32; \ + .hidden __bcopy_ia32; \ + __bcopy_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32 + +#endif + +#include "../bcopy.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S new file mode 100644 index 0000000000..507b288bb3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S @@ -0,0 +1,3 @@ +#define USE_AS_BZERO +#define __memset_sse2_rep __bzero_sse2_rep +#include "memset-sse2-rep.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S new file mode 100644 index 0000000000..8d04512e4e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_BZERO +#define __memset_sse2 __bzero_sse2 +#include "memset-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S new file mode 100644 index 0000000000..9dac490aa2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S @@ -0,0 +1,62 @@ +/* Multiple versions of bzero + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) + .text +ENTRY(__bzero) + .type __bzero, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__bzero_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX ( __bzero_sse2) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__bzero_sse2_rep) +2: ret +END(__bzero) + +# undef ENTRY +# define ENTRY(name) \ + .type __bzero_ia32, @function; \ + .p2align 4; \ + .globl __bzero_ia32; \ + .hidden __bzero_ia32; \ + __bzero_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __bzero_ia32, .-__bzero_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI___bzero; __GI___bzero = __bzero_ia32 +# endif +#endif + +#include "../bzero.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c new file mode 100644 index 0000000000..e8026a2a78 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c @@ -0,0 +1,376 @@ +/* Enumerate available IFUNC implementations of a function. i686 version. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <assert.h> +#include <string.h> +#include <wchar.h> +#include <ifunc-impl-list.h> +#include "init-arch.h" + +/* Maximum number of IFUNC implementations. */ +#define MAX_IFUNC 4 + +/* Fill ARRAY of MAX elements with IFUNC implementations for function + NAME and return the number of valid entries. */ + +size_t +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + size_t max) +{ + assert (max >= MAX_IFUNC); + + size_t i = 0; + + /* Support sysdeps/i386/i686/multiarch/bcopy.S. */ + IFUNC_IMPL (i, name, bcopy, + IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3), + __bcopy_ssse3_rep) + IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3), + __bcopy_ssse3) + IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2), + __bcopy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/bzero.S. */ + IFUNC_IMPL (i, name, bzero, + IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2), + __bzero_sse2_rep) + IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2), + __bzero_sse2) + IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memchr.S. */ + IFUNC_IMPL (i, name, memchr, + IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2), + __memchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2), + __memchr_sse2) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memcmp.S. */ + IFUNC_IMPL (i, name, memcmp, + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_2), + __memcmp_sse4_2) + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3), + __memcmp_ssse3) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memmove_chk.S. */ + IFUNC_IMPL (i, name, __memmove_chk, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), + __memmove_chk_ssse3_rep) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), + __memmove_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSE2), + __memmove_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, + __memmove_chk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memmove.S. */ + IFUNC_IMPL (i, name, memmove, + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), + __memmove_ssse3_rep) + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), + __memmove_ssse3) + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2), + __memmove_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memrchr.S. */ + IFUNC_IMPL (i, name, memrchr, + IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2), + __memrchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2), + __memrchr_sse2) + IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memset_chk.S. */ + IFUNC_IMPL (i, name, __memset_chk, + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_CPU_FEATURE (SSE2), + __memset_chk_sse2_rep) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_CPU_FEATURE (SSE2), + __memset_chk_sse2) + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memset.S. */ + IFUNC_IMPL (i, name, memset, + IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2), + __memset_sse2_rep) + IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2), + __memset_sse2) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ia32)) + + /* Support sysdeps/i386/i686/multiarch/rawmemchr.S. */ + IFUNC_IMPL (i, name, rawmemchr, + IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2), + __rawmemchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2), + __rawmemchr_sse2) + IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/stpncpy.S. */ + IFUNC_IMPL (i, name, stpncpy, + IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3), + __stpncpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSE2), + __stpncpy_sse2) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/stpcpy.S. */ + IFUNC_IMPL (i, name, stpcpy, + IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3), + __stpcpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSE2), + __stpcpy_sse2) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcasecmp.S. */ + IFUNC_IMPL (i, name, strcasecmp, + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSE4_2), + __strcasecmp_sse4_2) + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSSE3), + __strcasecmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcasecmp_l.S. */ + IFUNC_IMPL (i, name, strcasecmp_l, + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSE4_2), + __strcasecmp_l_sse4_2) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSSE3), + __strcasecmp_l_ssse3) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, + __strcasecmp_l_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcat.S. */ + IFUNC_IMPL (i, name, strcat, + IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3), + __strcat_ssse3) + IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSE2), + __strcat_sse2) + IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strchr.S. */ + IFUNC_IMPL (i, name, strchr, + IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2), + __strchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2), + __strchr_sse2) + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcmp.S. */ + IFUNC_IMPL (i, name, strcmp, + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2), + __strcmp_sse4_2) + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3), + __strcmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcpy.S. */ + IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3), + __strcpy_ssse3) + IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSE2), + __strcpy_sse2) + IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcspn.S. */ + IFUNC_IMPL (i, name, strcspn, + IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2), + __strcspn_sse42) + IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncase.S. */ + IFUNC_IMPL (i, name, strncasecmp, + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSE4_2), + __strncasecmp_sse4_2) + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSSE3), + __strncasecmp_ssse3) + IFUNC_IMPL_ADD (array, i, strncasecmp, 1, + __strncasecmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncase_l.S. */ + IFUNC_IMPL (i, name, strncasecmp_l, + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSE4_2), + __strncasecmp_l_sse4_2) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSSE3), + __strncasecmp_l_ssse3) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, + __strncasecmp_l_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncat.S. */ + IFUNC_IMPL (i, name, strncat, + IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3), + __strncat_ssse3) + IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSE2), + __strncat_sse2) + IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncpy.S. */ + IFUNC_IMPL (i, name, strncpy, + IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3), + __strncpy_ssse3) + IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSE2), + __strncpy_sse2) + IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strnlen.S. */ + IFUNC_IMPL (i, name, strnlen, + IFUNC_IMPL_ADD (array, i, strnlen, HAS_CPU_FEATURE (SSE2), + __strnlen_sse2) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strpbrk.S. */ + IFUNC_IMPL (i, name, strpbrk, + IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2), + __strpbrk_sse42) + IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strrchr.S. */ + IFUNC_IMPL (i, name, strrchr, + IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2), + __strrchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2), + __strrchr_sse2) + IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strspn.S. */ + IFUNC_IMPL (i, name, strspn, + IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2), + __strspn_sse42) + IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcschr.S. */ + IFUNC_IMPL (i, name, wcschr, + IFUNC_IMPL_ADD (array, i, wcschr, HAS_CPU_FEATURE (SSE2), + __wcschr_sse2) + IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcscmp.S. */ + IFUNC_IMPL (i, name, wcscmp, + IFUNC_IMPL_ADD (array, i, wcscmp, HAS_CPU_FEATURE (SSE2), + __wcscmp_sse2) + IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcscpy.S. */ + IFUNC_IMPL (i, name, wcscpy, + IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3), + __wcscpy_ssse3) + IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcslen.S. */ + IFUNC_IMPL (i, name, wcslen, + IFUNC_IMPL_ADD (array, i, wcslen, HAS_CPU_FEATURE (SSE2), + __wcslen_sse2) + IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcsrchr.S. */ + IFUNC_IMPL (i, name, wcsrchr, + IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_CPU_FEATURE (SSE2), + __wcsrchr_sse2) + IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wmemcmp.S. */ + IFUNC_IMPL (i, name, wmemcmp, + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_2), + __wmemcmp_sse4_2) + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3), + __wmemcmp_ssse3) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_ia32)) + +#ifdef SHARED + /* Support sysdeps/i386/i686/multiarch/memcpy_chk.S. */ + IFUNC_IMPL (i, name, __memcpy_chk, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __memcpy_chk_ssse3_rep) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __memcpy_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSE2), + __memcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, + __memcpy_chk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memcpy.S. */ + IFUNC_IMPL (i, name, memcpy, + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), + __memcpy_ssse3_rep) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), + __memcpy_ssse3) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2), + __memcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S. */ + IFUNC_IMPL (i, name, __mempcpy_chk, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __mempcpy_chk_ssse3_rep) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __mempcpy_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSE2), + __mempcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, + __mempcpy_chk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/mempcpy.S. */ + IFUNC_IMPL (i, name, mempcpy, + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), + __mempcpy_ssse3_rep) + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), + __mempcpy_ssse3) + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2), + __mempcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strlen.S. */ + IFUNC_IMPL (i, name, strlen, + IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2), + __strlen_sse2_bsf) + IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2), + __strlen_sse2) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncmp.S. */ + IFUNC_IMPL (i, name, strncmp, + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2), + __strncmp_sse4_2) + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3), + __strncmp_ssse3) + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_ia32)) +#endif + + return i; +} diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym new file mode 100644 index 0000000000..aebff9a4f9 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym @@ -0,0 +1,11 @@ +#include <locale/localeinfo.h> +#include <langinfo.h> +#include <stddef.h> + +-- + +LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales) +LC_CTYPE +_NL_CTYPE_NONASCII_CASE +LOCALE_DATA_VALUES offsetof (struct __locale_data, values) +SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0]) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S new file mode 100644 index 0000000000..dd316486e6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S @@ -0,0 +1,502 @@ +/* Optimized memchr with sse2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + +# ifndef USE_AS_RAWMEMCHR +# define LEN STR2+4 +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); +# endif + +# ifndef MEMCHR +# define MEMCHR __memchr_sse2_bsf +# endif + + .text +ENTRY (MEMCHR) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + +# ifndef USE_AS_RAWMEMCHR + mov LEN(%esp), %edx + test %edx, %edx + jz L(return_null_1) +# endif + mov %ecx, %eax + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %ecx + ja L(crosscache) + + movdqu (%eax), %xmm0 + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %ecx + test %ecx, %ecx + je L(unaligned_no_match_1) +/* Check which byte is a match. */ + bsf %ecx, %ecx + +# ifndef USE_AS_RAWMEMCHR + sub %ecx, %edx + jbe L(return_null_1) +# endif + add %ecx, %eax + ret + + .p2align 4 +L(unaligned_no_match_1): +# ifndef USE_AS_RAWMEMCHR + sub $16, %edx + jbe L(return_null_1) + PUSH (%edi) + lea 16(%eax), %edi + and $15, %eax + and $-16, %edi + add %eax, %edx +# else + lea 16(%eax), %edx + and $-16, %edx +# endif + jmp L(loop_prolog) + + .p2align 4 +L(return_null_1): + xor %eax, %eax + ret + +# ifndef USE_AS_RAWMEMCHR + CFI_POP (%edi) +# endif + + .p2align 4 +L(crosscache): +/* Handle unaligned string. */ + +# ifndef USE_AS_RAWMEMCHR + PUSH (%edi) + mov %eax, %edi + and $15, %ecx + and $-16, %edi + movdqa (%edi), %xmm0 +# else + mov %eax, %edx + and $15, %ecx + and $-16, %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + sub %eax, %edx + jbe L(return_null) + add %edi, %eax + add %ecx, %eax + RETURN +# else + add %edx, %eax + add %ecx, %eax + ret +# endif + + .p2align 4 +L(unaligned_no_match): +# ifndef USE_AS_RAWMEMCHR + /* Calculate the last acceptable address and check for possible + addition overflow by using satured math: + edx = ecx + edx + edx |= -(edx < ecx) */ + add %ecx, %edx + sbb %eax, %eax + or %eax, %edx + sub $16, %edx + jbe L(return_null) + add $16, %edi +# else + add $16, %edx +# endif + + .p2align 4 +/* Loop start on aligned string. */ +L(loop_prolog): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + +# ifndef USE_AS_RAWMEMCHR + test $0x3f, %edi +# else + test $0x3f, %edx +# endif + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm3 +# else + movdqa 48(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + test %eax, %eax + jnz L(matches0) + +# ifndef USE_AS_RAWMEMCHR + mov %edi, %ecx + and $-64, %edi + and $63, %ecx + add %ecx, %edx +# else + and $-64, %edx +# endif + + .p2align 4 +L(align64_loop): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 + movdqa 16(%edi), %xmm2 + movdqa 32(%edi), %xmm3 + movdqa 48(%edi), %xmm4 +# else + movdqa (%edx), %xmm0 + movdqa 16(%edx), %xmm2 + movdqa 32(%edx), %xmm3 + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + + test %eax, %eax + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edi +# else + sub $64, %edx +# endif + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + + pcmpeqb %xmm1, %xmm3 + +# ifndef USE_AS_RAWMEMCHR + pcmpeqb 48(%edi), %xmm1 +# else + pcmpeqb 48(%edx), %xmm1 +# endif + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + lea 48(%edi, %eax), %eax + RETURN +# else + lea 48(%edx, %eax), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%edi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%edi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb 48(%edi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches48_1) + xor %eax, %eax + RETURN + + .p2align 4 +L(exit_loop_32): + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 16(%edi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches16_1) + xor %eax, %eax + RETURN +# endif + .p2align 4 +L(matches0): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea -16(%eax, %edi), %eax + RETURN +# else + lea -16(%eax, %edx), %eax + ret +# endif + + .p2align 4 +L(matches): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + add %edi, %eax + RETURN +# else + add %edx, %eax + ret +# endif + + .p2align 4 +L(matches16): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea 16(%eax, %edi), %eax + RETURN +# else + lea 16(%eax, %edx), %eax + ret +# endif + + .p2align 4 +L(matches32): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea 32(%eax, %edi), %eax + RETURN +# else + lea 32(%eax, %edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(matches_1): + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + add %edi, %eax + RETURN + + .p2align 4 +L(matches16_1): + sub $16, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 16(%edi, %eax), %eax + RETURN + + .p2align 4 +L(matches32_1): + sub $32, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 32(%edi, %eax), %eax + RETURN + + .p2align 4 +L(matches48_1): + sub $48, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 48(%edi, %eax), %eax + RETURN +# endif + .p2align 4 +L(return_null): + xor %eax, %eax +# ifndef USE_AS_RAWMEMCHR + RETURN +# else + ret +# endif + +END (MEMCHR) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S new file mode 100644 index 0000000000..172d70de13 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S @@ -0,0 +1,709 @@ +/* Optimized memchr with sse2 without bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef USE_AS_RAWMEMCHR +# define ENTRANCE PUSH(%edi); +# define PARMS 8 +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); +# else +# define ENTRANCE +# define PARMS 4 +# endif + +# define STR1 PARMS +# define STR2 STR1+4 + +# ifndef USE_AS_RAWMEMCHR +# define LEN STR2+4 +# endif + +# ifndef MEMCHR +# define MEMCHR __memchr_sse2 +# endif + + atom_text_section +ENTRY (MEMCHR) + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 +# ifndef USE_AS_RAWMEMCHR + mov LEN(%esp), %edx + test %edx, %edx + jz L(return_null) +# endif + + punpcklbw %xmm1, %xmm1 +# ifndef USE_AS_RAWMEMCHR + mov %ecx, %edi +# else + mov %ecx, %edx +# endif + punpcklbw %xmm1, %xmm1 + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + cmp $48, %ecx + ja L(crosscache) + +# ifndef USE_AS_RAWMEMCHR + movdqu (%edi), %xmm0 +# else + movdqu (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax +# ifndef USE_AS_RAWMEMCHR + jnz L(match_case2_prolog) + + sub $16, %edx + jbe L(return_null) + lea 16(%edi), %edi + and $15, %ecx + and $-16, %edi + add %ecx, %edx +# else + jnz L(match_case1_prolog) + lea 16(%edx), %edx + and $-16, %edx +# endif + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %ecx +# ifndef USE_AS_RAWMEMCHR + and $-16, %edi + movdqa (%edi), %xmm0 +# else + and $-16, %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + sar %cl, %eax + test %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + jnz L(match_case2_prolog1) + /* "ecx" is less than 16. Calculate "edx + ecx - 16" by using + "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to void + possible addition overflow. */ + neg %ecx + add $16, %ecx + sub %ecx, %edx + jbe L(return_null) + lea 16(%edi), %edi +# else + jnz L(match_case1_prolog1) + lea 16(%edx), %edx +# endif + + .p2align 4 +L(loop_prolog): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + lea 16(%ecx), %ecx + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + lea 64(%edi), %edi + sub $64, %edx + jbe L(exit_loop) + + movdqa (%edi), %xmm0 +# else + lea 64(%edx), %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + lea 16(%ecx), %ecx + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + lea 64(%edi), %edi + mov %edi, %ecx + and $-64, %edi + and $63, %ecx + add %ecx, %edx +# else + lea 64(%edx), %edx + and $-64, %edx +# endif + + .p2align 4 +L(align64_loop): + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 + movdqa 16(%edi), %xmm2 + movdqa 32(%edi), %xmm3 + movdqa 48(%edi), %xmm4 +# else + movdqa (%edx), %xmm0 + movdqa 16(%edx), %xmm2 + movdqa 32(%edx), %xmm3 + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + pmovmskb %xmm4, %eax + + test %eax, %eax + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edi +# else + sub $64, %edx +# endif + + pmovmskb %xmm0, %eax + xor %ecx, %ecx + test %eax, %eax + jnz L(match_case1) + + pmovmskb %xmm2, %eax + lea 16(%ecx), %ecx + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + lea 16(%ecx), %ecx + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + pcmpeqb 48(%edi), %xmm1 +# else + pcmpeqb 48(%edx), %xmm1 +# endif + pmovmskb %xmm1, %eax + lea 16(%ecx), %ecx + + .p2align 4 +L(match_case1): +# ifndef USE_AS_RAWMEMCHR + add %ecx, %edi +# else +L(match_case1_prolog1): + add %ecx, %edx +L(match_case1_prolog): +# endif + test %al, %al + jz L(match_case1_high) + mov %al, %cl + and $15, %cl + jz L(match_case1_8) + test $0x01, %al + jnz L(ExitCase1_1) + test $0x02, %al + jnz L(ExitCase1_2) + test $0x04, %al + jnz L(ExitCase1_3) +# ifndef USE_AS_RAWMEMCHR + lea 3(%edi), %eax + RETURN +# else + lea 3(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_8): + test $0x10, %al + jnz L(ExitCase1_5) + test $0x20, %al + jnz L(ExitCase1_6) + test $0x40, %al + jnz L(ExitCase1_7) +# ifndef USE_AS_RAWMEMCHR + lea 7(%edi), %eax + RETURN +# else + lea 7(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_high): + mov %ah, %ch + and $15, %ch + jz L(match_case1_high_8) + test $0x01, %ah + jnz L(ExitCase1_9) + test $0x02, %ah + jnz L(ExitCase1_10) + test $0x04, %ah + jnz L(ExitCase1_11) +# ifndef USE_AS_RAWMEMCHR + lea 11(%edi), %eax + RETURN +# else + lea 11(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_high_8): + test $0x10, %ah + jnz L(ExitCase1_13) + test $0x20, %ah + jnz L(ExitCase1_14) + test $0x40, %ah + jnz L(ExitCase1_15) +# ifndef USE_AS_RAWMEMCHR + lea 15(%edi), %eax + RETURN +# else + lea 15(%edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(exit_loop): + add $64, %edx + + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case2) + cmp $16, %edx + jbe L(return_null) + + movdqa 16(%edi), %xmm2 + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case2) + cmp $32, %edx + jbe L(return_null) + + movdqa 32(%edi), %xmm3 + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case2) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb 48(%edi), %xmm1 + lea 16(%ecx), %ecx + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(match_case2) + + xor %eax, %eax + RETURN +# endif + + .p2align 4 +L(ExitCase1_1): +# ifndef USE_AS_RAWMEMCHR + mov %edi, %eax + RETURN +# else + mov %edx, %eax + ret +# endif + + .p2align 4 +L(ExitCase1_2): +# ifndef USE_AS_RAWMEMCHR + lea 1(%edi), %eax + RETURN +# else + lea 1(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_3): +# ifndef USE_AS_RAWMEMCHR + lea 2(%edi), %eax + RETURN +# else + lea 2(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_5): +# ifndef USE_AS_RAWMEMCHR + lea 4(%edi), %eax + RETURN +# else + lea 4(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_6): +# ifndef USE_AS_RAWMEMCHR + lea 5(%edi), %eax + RETURN +# else + lea 5(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_7): +# ifndef USE_AS_RAWMEMCHR + lea 6(%edi), %eax + RETURN +# else + lea 6(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_9): +# ifndef USE_AS_RAWMEMCHR + lea 8(%edi), %eax + RETURN +# else + lea 8(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_10): +# ifndef USE_AS_RAWMEMCHR + lea 9(%edi), %eax + RETURN +# else + lea 9(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_11): +# ifndef USE_AS_RAWMEMCHR + lea 10(%edi), %eax + RETURN +# else + lea 10(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_13): +# ifndef USE_AS_RAWMEMCHR + lea 12(%edi), %eax + RETURN +# else + lea 12(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_14): +# ifndef USE_AS_RAWMEMCHR + lea 13(%edi), %eax + RETURN +# else + lea 13(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_15): +# ifndef USE_AS_RAWMEMCHR + lea 14(%edi), %eax + RETURN +# else + lea 14(%edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(match_case2): + sub %ecx, %edx +L(match_case2_prolog1): + add %ecx, %edi +L(match_case2_prolog): + test %al, %al + jz L(match_case2_high) + mov %al, %cl + and $15, %cl + jz L(match_case2_8) + test $0x01, %al + jnz L(ExitCase2_1) + test $0x02, %al + jnz L(ExitCase2_2) + test $0x04, %al + jnz L(ExitCase2_3) + sub $4, %edx + jb L(return_null) + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_8): + test $0x10, %al + jnz L(ExitCase2_5) + test $0x20, %al + jnz L(ExitCase2_6) + test $0x40, %al + jnz L(ExitCase2_7) + sub $8, %edx + jb L(return_null) + lea 7(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_high): + mov %ah, %ch + and $15, %ch + jz L(match_case2_high_8) + test $0x01, %ah + jnz L(ExitCase2_9) + test $0x02, %ah + jnz L(ExitCase2_10) + test $0x04, %ah + jnz L(ExitCase2_11) + sub $12, %edx + jb L(return_null) + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_high_8): + test $0x10, %ah + jnz L(ExitCase2_13) + test $0x20, %ah + jnz L(ExitCase2_14) + test $0x40, %ah + jnz L(ExitCase2_15) + sub $16, %edx + jb L(return_null) + lea 15(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_1): + mov %edi, %eax + RETURN + + .p2align 4 +L(ExitCase2_2): + sub $2, %edx + jb L(return_null) + lea 1(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_3): + sub $3, %edx + jb L(return_null) + lea 2(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_5): + sub $5, %edx + jb L(return_null) + lea 4(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_6): + sub $6, %edx + jb L(return_null) + lea 5(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_7): + sub $7, %edx + jb L(return_null) + lea 6(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_9): + sub $9, %edx + jb L(return_null) + lea 8(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_10): + sub $10, %edx + jb L(return_null) + lea 9(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_11): + sub $11, %edx + jb L(return_null) + lea 10(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_13): + sub $13, %edx + jb L(return_null) + lea 12(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_14): + sub $14, %edx + jb L(return_null) + lea 13(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_15): + sub $15, %edx + jb L(return_null) + lea 14(%edi), %eax + RETURN +# endif + + .p2align 4 +L(return_null): + xor %eax, %eax +# ifndef USE_AS_RAWMEMCHR + RETURN +# else + ret +# endif + +END (MEMCHR) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S new file mode 100644 index 0000000000..bd0dace290 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S @@ -0,0 +1,65 @@ +/* Multiple versions of memchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__memchr) + .type __memchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + HAS_CPU_FEATURE (SSE2) + jz 2f + HAS_ARCH_FEATURE (Slow_BSF) + jz 3f + + LOAD_FUNC_GOT_EAX ( __memchr_sse2) + ret + +2: LOAD_FUNC_GOT_EAX (__memchr_ia32) + ret + +3: LOAD_FUNC_GOT_EAX (__memchr_sse2_bsf) + ret +END(__memchr) + +weak_alias(__memchr, memchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __memchr_ia32, @function; \ + .globl __memchr_ia32; \ + .p2align 4; \ + __memchr_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memchr_ia32, .-__memchr_ia32 + +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memchr; __GI_memchr = __memchr_ia32 + +#endif +#include "../../memchr.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S new file mode 100644 index 0000000000..2aa13048b2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S @@ -0,0 +1,1225 @@ +/* memcmp with SSE4.2, wmemcmp with SSE4.2 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_sse4_2 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1 + 4 +# define LEN BLK2 + 4 +# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) + + +# ifdef SHARED +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +/* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ +/* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ +/* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ +/* We loaded the jump table and adjusted EDX/ESI. Go. */ \ + jmp *%ebx +# else +# define JMPTBL(I, B) I + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + .section .text.sse4.2,"ax",@progbits +ENTRY (MEMCMP) + movl BLK1(%esp), %eax + movl BLK2(%esp), %edx + movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(return0) +# else + cmp $1, %ecx + jbe L(less1bytes) +# endif + + pxor %xmm0, %xmm0 + cmp $64, %ecx + ja L(64bytesormore) + cmp $8, %ecx + +# ifndef USE_AS_WMEMCMP + PUSH (%ebx) + jb L(less8bytes) +# else + jb L(less8bytes) + PUSH (%ebx) +# endif + + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less8bytes): + mov (%eax), %bl + cmpb (%edx), %bl + jne L(nonzero) + + mov 1(%eax), %bl + cmpb 1(%edx), %bl + jne L(nonzero) + + cmp $2, %ecx + jz L(0bytes) + + mov 2(%eax), %bl + cmpb 2(%edx), %bl + jne L(nonzero) + + cmp $3, %ecx + jz L(0bytes) + + mov 3(%eax), %bl + cmpb 3(%edx), %bl + jne L(nonzero) + + cmp $4, %ecx + jz L(0bytes) + + mov 4(%eax), %bl + cmpb 4(%edx), %bl + jne L(nonzero) + + cmp $5, %ecx + jz L(0bytes) + + mov 5(%eax), %bl + cmpb 5(%edx), %bl + jne L(nonzero) + + cmp $6, %ecx + jz L(0bytes) + + mov 6(%eax), %bl + cmpb 6(%edx), %bl + je L(0bytes) + +L(nonzero): + POP (%ebx) + mov $1, %eax + ja L(above) + neg %eax +L(above): + ret + CFI_PUSH (%ebx) +# endif + + .p2align 4 +L(0bytes): + POP (%ebx) + xor %eax, %eax + ret + +# ifdef USE_AS_WMEMCMP + +/* for wmemcmp, case N == 1 */ + + .p2align 4 +L(less8bytes): + mov (%eax), %ecx + cmp (%edx), %ecx + je L(return0) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret + + .p2align 4 +L(return0): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less1bytes): + jb L(0bytesend) + movzbl (%eax), %eax + movzbl (%edx), %edx + sub %edx, %eax + ret + + .p2align 4 +L(0bytesend): + xor %eax, %eax + ret +# endif + .p2align 4 +L(64bytesormore): + PUSH (%ebx) + mov %ecx, %ebx + mov $64, %ecx + sub $64, %ebx +L(64bytesormore_loop): + movdqu (%eax), %xmm1 + movdqu (%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_16diff) + + movdqu 16(%eax), %xmm1 + movdqu 16(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_32diff) + + movdqu 32(%eax), %xmm1 + movdqu 32(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_48diff) + + movdqu 48(%eax), %xmm1 + movdqu 48(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_64diff) + add %ecx, %eax + add %ecx, %edx + sub %ecx, %ebx + jae L(64bytesormore_loop) + add %ebx, %ecx + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) + +# ifdef USE_AS_WMEMCMP + +/* Label needs only for table_64bytes filling */ +L(unreal_case): +/* no code here */ + +# endif + .p2align 4 +L(find_16diff): + sub $16, %ecx +L(find_32diff): + sub $16, %ecx +L(find_48diff): + sub $16, %ecx +L(find_64diff): + add %ecx, %edx + add %ecx, %eax + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(16bytes): + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# else + .p2align 4 +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + cmp -4(%edx), %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(49bytes): + movdqu -49(%eax), %xmm1 + movdqu -49(%edx), %xmm2 + mov $-49, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(33bytes): + movdqu -33(%eax), %xmm1 + movdqu -33(%edx), %xmm2 + mov $-33, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(17bytes): + mov -17(%eax), %ecx + mov -17(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(13bytes): + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(9bytes): + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(5bytes): + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(50bytes): + mov $-50, %ebx + movdqu -50(%eax), %xmm1 + movdqu -50(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + mov $-34, %ebx + movdqu -34(%eax), %xmm1 + movdqu -34(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%eax), %ecx + mov -18(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(14bytes): + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(10bytes): + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(6bytes): + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(2bytes): + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(51bytes): + mov $-51, %ebx + movdqu -51(%eax), %xmm1 + movdqu -51(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(35bytes): + mov $-35, %ebx + movdqu -35(%eax), %xmm1 + movdqu -35(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(19bytes): + movl -19(%eax), %ecx + movl -19(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(15bytes): + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(11bytes): + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(7bytes): + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(3bytes): + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) +L(1bytes): + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + .p2align 4 +L(52bytes): + movdqu -52(%eax), %xmm1 + movdqu -52(%edx), %xmm2 + mov $-52, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%eax), %xmm1 + movdqu -36(%edx), %xmm2 + mov $-36, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%eax), %xmm1 + movdqu -20(%edx), %xmm2 + mov $-20, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(53bytes): + movdqu -53(%eax), %xmm1 + movdqu -53(%edx), %xmm2 + mov $-53, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(37bytes): + mov $-37, %ebx + movdqu -37(%eax), %xmm1 + movdqu -37(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(21bytes): + mov $-21, %ebx + movdqu -21(%eax), %xmm1 + movdqu -21(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(54bytes): + movdqu -54(%eax), %xmm1 + movdqu -54(%edx), %xmm2 + mov $-54, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + mov $-38, %ebx + movdqu -38(%eax), %xmm1 + movdqu -38(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + mov $-22, %ebx + movdqu -22(%eax), %xmm1 + movdqu -22(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(55bytes): + movdqu -55(%eax), %xmm1 + movdqu -55(%edx), %xmm2 + mov $-55, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(39bytes): + mov $-39, %ebx + movdqu -39(%eax), %xmm1 + movdqu -39(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(23bytes): + mov $-23, %ebx + movdqu -23(%eax), %xmm1 + movdqu -23(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + .p2align 4 +L(56bytes): + movdqu -56(%eax), %xmm1 + movdqu -56(%edx), %xmm2 + mov $-56, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + mov $-40, %ebx + movdqu -40(%eax), %xmm1 + movdqu -40(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + mov $-24, %ebx + movdqu -24(%eax), %xmm1 + movdqu -24(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif + jne L(find_diff) + + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(57bytes): + movdqu -57(%eax), %xmm1 + movdqu -57(%edx), %xmm2 + mov $-57, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(41bytes): + mov $-41, %ebx + movdqu -41(%eax), %xmm1 + movdqu -41(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(25bytes): + mov $-25, %ebx + movdqu -25(%eax), %xmm1 + movdqu -25(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(58bytes): + movdqu -58(%eax), %xmm1 + movdqu -58(%edx), %xmm2 + mov $-58, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + mov $-42, %ebx + movdqu -42(%eax), %xmm1 + movdqu -42(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + mov $-26, %ebx + movdqu -26(%eax), %xmm1 + movdqu -26(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(59bytes): + movdqu -59(%eax), %xmm1 + movdqu -59(%edx), %xmm2 + mov $-59, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(43bytes): + mov $-43, %ebx + movdqu -43(%eax), %xmm1 + movdqu -43(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(27bytes): + mov $-27, %ebx + movdqu -27(%eax), %xmm1 + movdqu -27(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + .p2align 4 +L(60bytes): + movdqu -60(%eax), %xmm1 + movdqu -60(%edx), %xmm2 + mov $-60, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + mov $-44, %ebx + movdqu -44(%eax), %xmm1 + movdqu -44(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + mov $-28, %ebx + movdqu -28(%eax), %xmm1 + movdqu -28(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -12(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif + jne L(find_diff) + + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif + jne L(find_diff) + + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(61bytes): + movdqu -61(%eax), %xmm1 + movdqu -61(%edx), %xmm2 + mov $-61, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(45bytes): + mov $-45, %ebx + movdqu -45(%eax), %xmm1 + movdqu -45(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(29bytes): + mov $-29, %ebx + movdqu -29(%eax), %xmm1 + movdqu -29(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(62bytes): + movdqu -62(%eax), %xmm1 + movdqu -62(%edx), %xmm2 + mov $-62, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + mov $-46, %ebx + movdqu -46(%eax), %xmm1 + movdqu -46(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + mov $-30, %ebx + movdqu -30(%eax), %xmm1 + movdqu -30(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(63bytes): + movdqu -63(%eax), %xmm1 + movdqu -63(%edx), %xmm2 + mov $-63, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(47bytes): + mov $-47, %ebx + movdqu -47(%eax), %xmm1 + movdqu -47(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(31bytes): + mov $-31, %ebx + movdqu -31(%eax), %xmm1 + movdqu -31(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + + .p2align 4 +L(64bytes): + movdqu -64(%eax), %xmm1 + movdqu -64(%edx), %xmm2 + mov $-64, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%eax), %xmm1 + movdqu -48(%edx), %xmm2 + mov $-48, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%eax), %xmm1 + movdqu -32(%edx), %xmm2 + mov $-32, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -16(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -16(%edx), %ecx +# endif + jne L(find_diff) + + mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -12(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif + jne L(find_diff) + + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif + jne L(find_diff) + + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + mov (%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + mov 4(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + mov 8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + mov 12(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# else + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + cmp (%edx), %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + cmp 4(%edx), %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + cmp 8(%edx), %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + cmp 12(%edx), %ecx + + mov $0, %eax + jne L(find_diff) + RETURN +# endif + + .p2align 4 +L(find_diff): +# ifndef USE_AS_WMEMCMP + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + shr $16,%ecx + shr $16,%ebx + cmp %bl, %cl + jne L(end) + cmp %bx, %cx +L(end): + POP (%ebx) + mov $1, %eax + ja L(bigger) + neg %eax +L(bigger): + ret +# else + POP (%ebx) + mov $1, %eax + jg L(bigger) + neg %eax + ret + + .p2align 4 +L(bigger): + ret +# endif +END (MEMCMP) + + .section .rodata.sse4.2,"a",@progbits + .p2align 2 + .type L(table_64bytes), @object +# ifndef USE_AS_WMEMCMP +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(1bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(3bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(5bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(7bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(9bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(11bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(13bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(15bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(17bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(19bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(21bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(23bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(25bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(27bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(29bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(31bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(33bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(35bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(37bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(39bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(41bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(43bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(45bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(47bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(49bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(51bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(53bytes), L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(55bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(57bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(59bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(61bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(63bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S new file mode 100644 index 0000000000..5ebf5a4d73 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S @@ -0,0 +1,2157 @@ +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1+4 +# define LEN BLK2+4 +# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret +# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + atom_text_section +ENTRY (MEMCMP) + movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(zero) +# endif + + movl BLK1(%esp), %eax + cmp $48, %ecx + movl BLK2(%esp), %edx + jae L(48bytesormore) + +# ifndef USE_AS_WMEMCMP + cmp $1, %ecx + jbe L(less1bytes) +# endif + + PUSH (%ebx) + add %ecx, %edx + add %ecx, %eax + jmp L(less48bytes) + + CFI_POP (%ebx) + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less1bytes): + jb L(zero) + movb (%eax), %cl + cmp (%edx), %cl + je L(zero) + mov $1, %eax + ja L(1bytesend) + neg %eax +L(1bytesend): + ret +# endif + + .p2align 4 +L(zero): + xor %eax, %eax + ret + + .p2align 4 +L(48bytesormore): + PUSH (%ebx) + PUSH (%esi) + PUSH (%edi) + cfi_remember_state + movdqu (%eax), %xmm3 + movdqu (%edx), %xmm0 + movl %eax, %edi + movl %edx, %esi + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%edi), %edi + + sub $0xffff, %edx + lea 16(%esi), %esi + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %edx, %edi + sub %edx, %esi + add %edx, %ecx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %edx, %esi + +# ifndef USE_AS_WMEMCMP + cmp $8, %edx + jae L(next_unaligned_table) + cmp $0, %edx + je L(shr_0) + cmp $1, %edx + je L(shr_1) + cmp $2, %edx + je L(shr_2) + cmp $3, %edx + je L(shr_3) + cmp $4, %edx + je L(shr_4) + cmp $5, %edx + je L(shr_5) + cmp $6, %edx + je L(shr_6) + jmp L(shr_7) + + .p2align 2 +L(next_unaligned_table): + cmp $8, %edx + je L(shr_8) + cmp $9, %edx + je L(shr_9) + cmp $10, %edx + je L(shr_10) + cmp $11, %edx + je L(shr_11) + cmp $12, %edx + je L(shr_12) + cmp $13, %edx + je L(shr_13) + cmp $14, %edx + je L(shr_14) + jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif + + .p2align 4 +L(shr_0): + cmp $80, %ecx + jae L(shr_0_gobble) + lea -48(%ecx), %ecx + xor %eax, %eax + movaps (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + movaps 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 + pand %xmm1, %xmm2 + pmovmskb %xmm2, %edx + add $32, %edi + add $32, %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_0_gobble): + lea -48(%ecx), %ecx + movdqa (%esi), %xmm0 + xor %eax, %eax + pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 +L(shr_0_gobble_loop): + pand %xmm0, %xmm2 + sub $32, %ecx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%esi), %xmm0 + movdqa 48(%esi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%edi), %xmm0 + pcmpeqb 48(%edi), %xmm2 + lea 32(%edi), %edi + lea 32(%esi), %esi + jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %ecx + jge L(shr_0_gobble_loop_next) + inc %edx + add $32, %ecx +L(shr_0_gobble_loop_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_1): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_1_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $1,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $1,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 1(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_1_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $1,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $1,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_1_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $1,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $1,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_1_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_1_gobble_next) + inc %edx + add $32, %ecx +L(shr_1_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 1(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_2): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $2,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_2_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $2,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $2,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $2,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $2,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_2_gobble_next) + inc %edx + add $32, %ecx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_3): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_3_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $3,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $3,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 3(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_3_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $3,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $3,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_3_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $3,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $3,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_3_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_3_gobble_next) + inc %edx + add $32, %ecx +L(shr_3_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 3(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_4): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $4,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_4_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $4,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $4,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $4,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $4,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_4_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_4_gobble_next) + inc %edx + add $32, %ecx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_5): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_5_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $5,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $5,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 5(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_5_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $5,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $5,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_5_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $5,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $5,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_5_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_5_gobble_next) + inc %edx + add $32, %ecx +L(shr_5_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 5(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_6): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $6,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_6_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $6,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $6,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $6,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $6,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_6_gobble_next) + inc %edx + add $32, %ecx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_7): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_7_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $7,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $7,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 7(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_7_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $7,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $7,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_7_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $7,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $7,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_7_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_7_gobble_next) + inc %edx + add $32, %ecx +L(shr_7_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 7(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_8): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $8,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_8_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $8,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $8,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $8,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $8,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_8_gobble_next) + inc %edx + add $32, %ecx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_9): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_9_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $9,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $9,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 9(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_9_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $9,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $9,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_9_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $9,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $9,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_9_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_9_gobble_next) + inc %edx + add $32, %ecx +L(shr_9_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 9(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_10): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $10,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_10_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $10, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $10, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $10,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $10,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_10_gobble_next) + inc %edx + add $32, %ecx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_11): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_11_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $11, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $11, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 11(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_11_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $11, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $11, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_11_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $11,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $11,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_11_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_11_gobble_next) + inc %edx + add $32, %ecx +L(shr_11_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 11(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_12): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_12_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $12, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $12, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $12,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $12,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_12_gobble_next) + inc %edx + add $32, %ecx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_13): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_13_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $13, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $13, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 13(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_13_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $13, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $13, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_13_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $13,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $13,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_13_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_13_gobble_next) + inc %edx + add $32, %ecx +L(shr_13_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 13(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_14): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_14_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $14, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $14, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $14,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $14,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_14_gobble_next) + inc %edx + add $32, %ecx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_15): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_15_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $15, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $15, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 15(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_15_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $15, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $15, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_15_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $15,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $15,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_15_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_15_gobble_next) + inc %edx + add $32, %ecx +L(shr_15_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 15(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(exit): + pmovmskb %xmm1, %ebx + sub $0xffff, %ebx + jz L(first16bytes) + lea -16(%esi), %esi + lea -16(%edi), %edi + mov %ebx, %edx + +L(first16bytes): + add %eax, %esi +L(less16bytes): + +# ifndef USE_AS_WMEMCMP + test %dl, %dl + jz L(next_24_bytes) + + test $0x01, %dl + jnz L(Byte16) + + test $0x02, %dl + jnz L(Byte17) + + test $0x04, %dl + jnz L(Byte18) + + test $0x08, %dl + jnz L(Byte19) + + test $0x10, %dl + jnz L(Byte20) + + test $0x20, %dl + jnz L(Byte21) + + test $0x40, %dl + jnz L(Byte22) +L(Byte23): + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte16): + movzbl -16(%edi), %eax + movzbl -16(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte17): + movzbl -15(%edi), %eax + movzbl -15(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte18): + movzbl -14(%edi), %eax + movzbl -14(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte19): + movzbl -13(%edi), %eax + movzbl -13(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte20): + movzbl -12(%edi), %eax + movzbl -12(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte21): + movzbl -11(%edi), %eax + movzbl -11(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte22): + movzbl -10(%edi), %eax + movzbl -10(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(next_24_bytes): + lea 8(%edi), %edi + lea 8(%esi), %esi + test $0x01, %dh + jnz L(Byte16) + + test $0x02, %dh + jnz L(Byte17) + + test $0x04, %dh + jnz L(Byte18) + + test $0x08, %dh + jnz L(Byte19) + + test $0x10, %dh + jnz L(Byte20) + + test $0x20, %dh + jnz L(Byte21) + + test $0x40, %dh + jnz L(Byte22) + + .p2align 4 +L(Byte31): + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx + sub %edx, %eax + RETURN_END +# else + +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%edi), %eax + cmp -16(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word): + mov -12(%edi), %eax + cmp -12(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%edi), %eax + cmp -8(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word): + mov -4(%edi), %eax + cmp -4(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(nequal): + mov $1, %eax + jg L(nequal_bigger) + neg %eax + RETURN + + .p2align 4 +L(nequal_bigger): + RETURN_END +# endif + + CFI_PUSH (%ebx) + + .p2align 4 +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) +# ifndef USE_AS_WMEMCMP + cmp $9, %ecx + je L(9bytes) + cmp $10, %ecx + je L(10bytes) + cmp $11, %ecx + je L(11bytes) + cmp $12, %ecx + je L(12bytes) + cmp $13, %ecx + je L(13bytes) + cmp $14, %ecx + je L(14bytes) + jmp L(15bytes) +# else + jmp L(12bytes) +# endif + + .p2align 4 +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) +# ifndef USE_AS_WMEMCMP + cmp $17, %ecx + je L(17bytes) + cmp $18, %ecx + je L(18bytes) + cmp $19, %ecx + je L(19bytes) + cmp $20, %ecx + je L(20bytes) + cmp $21, %ecx + je L(21bytes) + cmp $22, %ecx + je L(22bytes) + jmp L(23bytes) +# else + jmp L(20bytes) +# endif + + .p2align 4 +L(more24bytes): + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) +# ifndef USE_AS_WMEMCMP + cmp $25, %ecx + je L(25bytes) + cmp $26, %ecx + je L(26bytes) + cmp $27, %ecx + je L(27bytes) + cmp $28, %ecx + je L(28bytes) + cmp $29, %ecx + je L(29bytes) + cmp $30, %ecx + je L(30bytes) + jmp L(31bytes) +# else + jmp L(28bytes) +# endif + + .p2align 4 +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) +# ifndef USE_AS_WMEMCMP + cmp $33, %ecx + je L(33bytes) + cmp $34, %ecx + je L(34bytes) + cmp $35, %ecx + je L(35bytes) + cmp $36, %ecx + je L(36bytes) + cmp $37, %ecx + je L(37bytes) + cmp $38, %ecx + je L(38bytes) + jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + .p2align 4 +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) +# ifndef USE_AS_WMEMCMP + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif + + .p2align 4 +L(more40bytes): + cmp $40, %ecx + je L(40bytes) +# ifndef USE_AS_WMEMCMP + cmp $41, %ecx + je L(41bytes) + cmp $42, %ecx + je L(42bytes) + cmp $43, %ecx + je L(43bytes) + cmp $44, %ecx + je L(44bytes) + cmp $45, %ecx + je L(45bytes) + cmp $46, %ecx + je L(46bytes) + jmp L(47bytes) + + .p2align 4 +L(44bytes): + mov -44(%eax), %ecx + mov -44(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + mov -40(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + mov -36(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + mov -32(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + mov -28(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + mov -24(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + mov -20(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + POP (%ebx) + ret + CFI_PUSH (%ebx) +# else + .p2align 4 +L(44bytes): + mov -44(%eax), %ecx + cmp -44(%edx), %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + cmp -40(%edx), %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + cmp -36(%edx), %ecx + jne L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + cmp -32(%edx), %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + cmp -28(%edx), %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + cmp -24(%edx), %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + cmp -20(%edx), %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + xor %eax, %eax + cmp -4(%edx), %ecx + jne L(find_diff) + POP (%ebx) + ret + CFI_PUSH (%ebx) +# endif + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(45bytes): + mov -45(%eax), %ecx + mov -45(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(41bytes): + mov -41(%eax), %ecx + mov -41(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(37bytes): + mov -37(%eax), %ecx + mov -37(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(33bytes): + mov -33(%eax), %ecx + mov -33(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(29bytes): + mov -29(%eax), %ecx + mov -29(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(25bytes): + mov -25(%eax), %ecx + mov -25(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(21bytes): + mov -21(%eax), %ecx + mov -21(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(17bytes): + mov -17(%eax), %ecx + mov -17(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(13bytes): + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(9bytes): + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(5bytes): + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 +L(46bytes): + mov -46(%eax), %ecx + mov -46(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(42bytes): + mov -42(%eax), %ecx + mov -42(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(38bytes): + mov -38(%eax), %ecx + mov -38(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(34bytes): + mov -34(%eax), %ecx + mov -34(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(30bytes): + mov -30(%eax), %ecx + mov -30(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(26bytes): + mov -26(%eax), %ecx + mov -26(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(22bytes): + mov -22(%eax), %ecx + mov -22(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(18bytes): + mov -18(%eax), %ecx + mov -18(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(14bytes): + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(10bytes): + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(6bytes): + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(2bytes): + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 +L(47bytes): + movl -47(%eax), %ecx + movl -47(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(43bytes): + movl -43(%eax), %ecx + movl -43(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(39bytes): + movl -39(%eax), %ecx + movl -39(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(35bytes): + movl -35(%eax), %ecx + movl -35(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(31bytes): + movl -31(%eax), %ecx + movl -31(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(27bytes): + movl -27(%eax), %ecx + movl -27(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(23bytes): + movl -23(%eax), %ecx + movl -23(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(19bytes): + movl -19(%eax), %ecx + movl -19(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(15bytes): + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(11bytes): + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(7bytes): + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(3bytes): + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 +L(find_diff): + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + shr $16,%ecx + shr $16,%ebx + cmp %bl, %cl + jne L(end) + cmp %bx, %cx + + .p2align 4 +L(end): + POP (%ebx) + mov $1, %eax + ja L(bigger) + neg %eax +L(bigger): + ret +# else + +/* for wmemcmp */ + .p2align 4 +L(find_diff): + POP (%ebx) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret + +# endif +END (MEMCMP) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S new file mode 100644 index 0000000000..1fc5994a17 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S @@ -0,0 +1,62 @@ +/* Multiple versions of memcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(memcmp) + .type memcmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memcmp_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memcmp_ssse3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + LOAD_FUNC_GOT_EAX (__memcmp_sse4_2) +2: ret +END(memcmp) + +# undef ENTRY +# define ENTRY(name) \ + .type __memcmp_ia32, @function; \ + .p2align 4; \ + .globl __memcmp_ia32; \ + .hidden __memcmp_ia32; \ + __memcmp_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memcmp; __GI_memcmp = __memcmp_ia32 +# endif +#endif + +#include "../memcmp.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S new file mode 100644 index 0000000000..2fe2072cb1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S @@ -0,0 +1,681 @@ +/* memcpy optimized with SSE2 unaligned memory access instructions. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +# include <sysdep.h> +# include "asm-syntax.h" + +# ifndef MEMCPY +# define MEMCPY __memcpy_sse2_unaligned +# define MEMCPY_CHK __memcpy_chk_sse2_unaligned +# endif + +# ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +# else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 /* Preserve EBX. */ +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) + + .section .text.sse2,"ax",@progbits +# if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +# endif + +ENTRY (MEMCPY) + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + cmp %edx, %eax + +# ifdef USE_AS_MEMMOVE + jg L(check_forward) + +L(mm_len_0_or_more_backward): +/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128] + separately. */ + cmp $16, %ecx + jbe L(mm_len_0_16_bytes_backward) + + cmpl $32, %ecx + jg L(mm_len_32_or_more_backward) + +/* Copy [0..32] and return. */ + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_32_or_more_backward): + cmpl $64, %ecx + jg L(mm_len_64_or_more_backward) + +/* Copy [0..64] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu -16(%eax, %ecx), %xmm2 + movdqu -32(%eax, %ecx), %xmm3 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, -16(%edx, %ecx) + movdqu %xmm3, -32(%edx, %ecx) + jmp L(return) + +L(mm_len_64_or_more_backward): + cmpl $128, %ecx + jg L(mm_len_128_or_more_backward) + +/* Copy [0..128] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_128_or_more_backward): + add %ecx, %eax + cmp %edx, %eax + movl SRC(%esp), %eax + jle L(forward) + PUSH (%esi) + PUSH (%edi) + PUSH (%ebx) + +/* Aligning the address of destination. */ + movdqu (%eax), %xmm4 + movdqu 16(%eax), %xmm5 + movdqu 32(%eax), %xmm6 + movdqu 48(%eax), %xmm7 + leal (%edx, %ecx), %esi + movdqu -16(%eax, %ecx), %xmm0 + subl $16, %esp + movdqu %xmm0, (%esp) + mov %ecx, %edi + movl %esi, %ecx + andl $-16, %ecx + leal (%ecx), %ebx + subl %edx, %ebx + leal (%eax, %ebx), %eax + shrl $6, %ebx + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %edi +# else +# ifdef SHARED + PUSH (%ebx) + SETUP_PIC_REG (bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi + POP (%ebx) +# else + cmp __x86_shared_cache_size_half, %edi +# endif +# endif + jae L(mm_large_page_loop_backward) + + .p2align 4 +L(mm_main_loop_backward): + + prefetcht0 -128(%eax) + + movdqu -64(%eax), %xmm0 + movdqu -48(%eax), %xmm1 + movdqu -32(%eax), %xmm2 + movdqu -16(%eax), %xmm3 + movaps %xmm0, -64(%ecx) + subl $64, %eax + movaps %xmm1, -48(%ecx) + movaps %xmm2, -32(%ecx) + movaps %xmm3, -16(%ecx) + subl $64, %ecx + sub $1, %ebx + jnz L(mm_main_loop_backward) + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, -16(%esi) + movdqu %xmm4, (%edx) + movdqu %xmm5, 16(%edx) + movdqu %xmm6, 32(%edx) + movdqu %xmm7, 48(%edx) + POP (%ebx) + jmp L(mm_return_pop_all) + +/* Copy [0..16] and return. */ +L(mm_len_0_16_bytes_backward): + testb $24, %cl + jnz L(mm_len_9_16_bytes_backward) + testb $4, %cl + .p2align 4,,5 + jnz L(mm_len_5_8_bytes_backward) + testl %ecx, %ecx + .p2align 4,,2 + je L(return) + testb $2, %cl + .p2align 4,,1 + jne L(mm_len_3_4_bytes_backward) + movzbl -1(%eax,%ecx), %ebx + movzbl (%eax), %eax + movb %bl, -1(%edx,%ecx) + movb %al, (%edx) + jmp L(return) + +L(mm_len_3_4_bytes_backward): + movzwl -2(%eax,%ecx), %ebx + movzwl (%eax), %eax + movw %bx, -2(%edx,%ecx) + movw %ax, (%edx) + jmp L(return) + +L(mm_len_9_16_bytes_backward): + PUSH (%esi) + movl -4(%eax,%ecx), %ebx + movl -8(%eax,%ecx), %esi + movl %ebx, -4(%edx,%ecx) + movl %esi, -8(%edx,%ecx) + subl $8, %ecx + POP (%esi) + jmp L(mm_len_0_16_bytes_backward) + +L(mm_len_5_8_bytes_backward): + movl (%eax), %ebx + movl -4(%eax,%ecx), %eax + movl %ebx, (%edx) + movl %eax, -4(%edx,%ecx) + jmp L(return) + +/* Big length copy backward part. */ + .p2align 4 +L(mm_large_page_loop_backward): + movdqu -64(%eax), %xmm0 + movdqu -48(%eax), %xmm1 + movdqu -32(%eax), %xmm2 + movdqu -16(%eax), %xmm3 + movntdq %xmm0, -64(%ecx) + subl $64, %eax + movntdq %xmm1, -48(%ecx) + movntdq %xmm2, -32(%ecx) + movntdq %xmm3, -16(%ecx) + subl $64, %ecx + sub $1, %ebx + jnz L(mm_large_page_loop_backward) + sfence + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, -16(%esi) + movdqu %xmm4, (%edx) + movdqu %xmm5, 16(%edx) + movdqu %xmm6, 32(%edx) + movdqu %xmm7, 48(%edx) + POP (%ebx) + jmp L(mm_return_pop_all) + +L(check_forward): + add %edx, %ecx + cmp %eax, %ecx + movl LEN(%esp), %ecx + jle L(forward) + +/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128] + separately. */ + cmp $16, %ecx + jbe L(mm_len_0_16_bytes_forward) + + cmpl $32, %ecx + ja L(mm_len_32_or_more_forward) + +/* Copy [0..32] and return. */ + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_32_or_more_forward): + cmpl $64, %ecx + ja L(mm_len_64_or_more_forward) + +/* Copy [0..64] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu -16(%eax, %ecx), %xmm2 + movdqu -32(%eax, %ecx), %xmm3 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, -16(%edx, %ecx) + movdqu %xmm3, -32(%edx, %ecx) + jmp L(return) + +L(mm_len_64_or_more_forward): + cmpl $128, %ecx + ja L(mm_len_128_or_more_forward) + +/* Copy [0..128] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_128_or_more_forward): + PUSH (%esi) + PUSH (%edi) + PUSH (%ebx) + +/* Aligning the address of destination. */ + movdqu -16(%eax, %ecx), %xmm4 + movdqu -32(%eax, %ecx), %xmm5 + movdqu -48(%eax, %ecx), %xmm6 + movdqu -64(%eax, %ecx), %xmm7 + leal (%edx, %ecx), %esi + movdqu (%eax), %xmm0 + subl $16, %esp + movdqu %xmm0, (%esp) + mov %ecx, %edi + leal 16(%edx), %ecx + andl $-16, %ecx + movl %ecx, %ebx + subl %edx, %ebx + addl %ebx, %eax + movl %esi, %ebx + subl %ecx, %ebx + shrl $6, %ebx + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %edi +# else +# ifdef SHARED + PUSH (%ebx) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi + POP (%ebx) +# else + cmp __x86_shared_cache_size_half, %edi +# endif +# endif + jae L(mm_large_page_loop_forward) + + .p2align 4 +L(mm_main_loop_forward): + + prefetcht0 128(%eax) + + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqa %xmm0, (%ecx) + addl $64, %eax + movaps %xmm1, 16(%ecx) + movaps %xmm2, 32(%ecx) + movaps %xmm3, 48(%ecx) + addl $64, %ecx + sub $1, %ebx + jnz L(mm_main_loop_forward) + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, (%edx) + movdqu %xmm4, -16(%esi) + movdqu %xmm5, -32(%esi) + movdqu %xmm6, -48(%esi) + movdqu %xmm7, -64(%esi) + POP (%ebx) + jmp L(mm_return_pop_all) + +L(mm_len_0_16_bytes_forward): + testb $24, %cl + jne L(mm_len_9_16_bytes_forward) + testb $4, %cl + .p2align 4,,5 + jne L(mm_len_5_8_bytes_forward) + testl %ecx, %ecx + .p2align 4,,2 + je L(return) + testb $2, %cl + .p2align 4,,1 + jne L(mm_len_2_4_bytes_forward) + movzbl -1(%eax,%ecx), %ebx + movzbl (%eax), %eax + movb %bl, -1(%edx,%ecx) + movb %al, (%edx) + jmp L(return) + +L(mm_len_2_4_bytes_forward): + movzwl -2(%eax,%ecx), %ebx + movzwl (%eax), %eax + movw %bx, -2(%edx,%ecx) + movw %ax, (%edx) + jmp L(return) + +L(mm_len_5_8_bytes_forward): + movl (%eax), %ebx + movl -4(%eax,%ecx), %eax + movl %ebx, (%edx) + movl %eax, -4(%edx,%ecx) + jmp L(return) + +L(mm_len_9_16_bytes_forward): + movq (%eax), %xmm0 + movq -8(%eax, %ecx), %xmm1 + movq %xmm0, (%edx) + movq %xmm1, -8(%edx, %ecx) + jmp L(return) + +L(mm_return_pop_all): + movl %edx, %eax + POP (%edi) + POP (%esi) + RETURN + +/* Big length copy forward part. */ + .p2align 4 +L(mm_large_page_loop_forward): + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movntdq %xmm0, (%ecx) + addl $64, %eax + movntdq %xmm1, 16(%ecx) + movntdq %xmm2, 32(%ecx) + movntdq %xmm3, 48(%ecx) + addl $64, %ecx + sub $1, %ebx + jnz L(mm_large_page_loop_forward) + sfence + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, (%edx) + movdqu %xmm4, -16(%esi) + movdqu %xmm5, -32(%esi) + movdqu %xmm6, -48(%esi) + movdqu %xmm7, -64(%esi) + POP (%ebx) + jmp L(mm_return_pop_all) +# endif + +L(forward): + cmp $16, %ecx + jbe L(len_0_16_bytes) + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +# endif + jae L(large_page) + + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + cmpl $32, %ecx + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jbe L(return) + + movdqu 16(%eax), %xmm0 + movdqu -32(%eax, %ecx), %xmm1 + cmpl $64, %ecx + movdqu %xmm0, 16(%edx) + movdqu %xmm1, -32(%edx, %ecx) + jbe L(return) + + movdqu 32(%eax), %xmm0 + movdqu 48(%eax), %xmm1 + movdqu -48(%eax, %ecx), %xmm2 + movdqu -64(%eax, %ecx), %xmm3 + cmpl $128, %ecx + movdqu %xmm0, 32(%edx) + movdqu %xmm1, 48(%edx) + movdqu %xmm2, -48(%edx, %ecx) + movdqu %xmm3, -64(%edx, %ecx) + jbe L(return) + +/* Now the main loop: we align the address of the destination. */ + leal 64(%edx), %ebx + andl $-64, %ebx + + addl %edx, %ecx + andl $-64, %ecx + + subl %edx, %eax + +/* We should stop two iterations before the termination + (in order not to misprefetch). */ + subl $64, %ecx + cmpl %ebx, %ecx + je L(main_loop_just_one_iteration) + + subl $64, %ecx + cmpl %ebx, %ecx + je L(main_loop_last_two_iterations) + + .p2align 4 +L(main_loop_cache): + + prefetcht0 128(%ebx, %eax) + + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqa %xmm0, (%ebx) + movaps %xmm1, 16(%ebx) + movaps %xmm2, 32(%ebx) + movaps %xmm3, 48(%ebx) + lea 64(%ebx), %ebx + cmpl %ebx, %ecx + jne L(main_loop_cache) + +L(main_loop_last_two_iterations): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqu 64(%ebx, %eax), %xmm4 + movdqu 80(%ebx, %eax), %xmm5 + movdqu 96(%ebx, %eax), %xmm6 + movdqu 112(%ebx, %eax), %xmm7 + movdqa %xmm0, (%ebx) + movaps %xmm1, 16(%ebx) + movaps %xmm2, 32(%ebx) + movaps %xmm3, 48(%ebx) + movaps %xmm4, 64(%ebx) + movaps %xmm5, 80(%ebx) + movaps %xmm6, 96(%ebx) + movaps %xmm7, 112(%ebx) + jmp L(return) + +L(main_loop_just_one_iteration): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqa %xmm0, (%ebx) + movaps %xmm1, 16(%ebx) + movaps %xmm2, 32(%ebx) + movaps %xmm3, 48(%ebx) + jmp L(return) + +L(large_page): + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + + movdqu 64(%eax), %xmm0 + movdqu 80(%eax), %xmm1 + movdqu 96(%eax), %xmm2 + movdqu 112(%eax), %xmm3 + movdqu -128(%eax, %ecx), %xmm4 + movdqu -112(%eax, %ecx), %xmm5 + movdqu -96(%eax, %ecx), %xmm6 + movdqu -80(%eax, %ecx), %xmm7 + movdqu %xmm0, 64(%edx) + movdqu %xmm1, 80(%edx) + movdqu %xmm2, 96(%edx) + movdqu %xmm3, 112(%edx) + movdqu %xmm4, -128(%edx, %ecx) + movdqu %xmm5, -112(%edx, %ecx) + movdqu %xmm6, -96(%edx, %ecx) + movdqu %xmm7, -80(%edx, %ecx) + +/* Now the main loop with non temporal stores. We align + the address of the destination. */ + leal 128(%edx), %ebx + andl $-128, %ebx + + addl %edx, %ecx + andl $-128, %ecx + + subl %edx, %eax + + .p2align 4 +L(main_loop_large_page): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqu 64(%ebx, %eax), %xmm4 + movdqu 80(%ebx, %eax), %xmm5 + movdqu 96(%ebx, %eax), %xmm6 + movdqu 112(%ebx, %eax), %xmm7 + movntdq %xmm0, (%ebx) + movntdq %xmm1, 16(%ebx) + movntdq %xmm2, 32(%ebx) + movntdq %xmm3, 48(%ebx) + movntdq %xmm4, 64(%ebx) + movntdq %xmm5, 80(%ebx) + movntdq %xmm6, 96(%ebx) + movntdq %xmm7, 112(%ebx) + lea 128(%ebx), %ebx + cmpl %ebx, %ecx + jne L(main_loop_large_page) + sfence + jmp L(return) + +L(len_0_16_bytes): + testb $24, %cl + jne L(len_9_16_bytes) + testb $4, %cl + .p2align 4,,5 + jne L(len_5_8_bytes) + testl %ecx, %ecx + .p2align 4,,2 + je L(return) + movzbl (%eax), %ebx + testb $2, %cl + movb %bl, (%edx) + je L(return) + movzwl -2(%eax,%ecx), %ebx + movw %bx, -2(%edx,%ecx) + jmp L(return) + +L(len_9_16_bytes): + movq (%eax), %xmm0 + movq -8(%eax, %ecx), %xmm1 + movq %xmm0, (%edx) + movq %xmm1, -8(%edx, %ecx) + jmp L(return) + +L(len_5_8_bytes): + movl (%eax), %ebx + movl %ebx, (%edx) + movl -4(%eax,%ecx), %ebx + movl %ebx, -4(%edx,%ecx) + +L(return): + movl %edx, %eax +# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif + RETURN + +END (MEMCPY) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S new file mode 100644 index 0000000000..687e083147 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S @@ -0,0 +1,1809 @@ +/* memcpy with SSSE3 and REP string. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#if IS_IN (libc) \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +#include "asm-syntax.h" + +#ifndef MEMCPY +# define MEMCPY __memcpy_ssse3_rep +# define MEMCPY_CHK __memcpy_chk_ssse3_rep +#endif + +#ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +#else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +#endif + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef SHARED +# define PARMS 8 /* Preserve EBX. */ +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx + +# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \ + addl $(TABLE - .), %ebx + +# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ + addl (%ebx,INDEX,SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx +#else +# define PARMS 4 +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) + +# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) + +# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +#endif + + .section .text.ssse3,"ax",@progbits +#if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif +ENTRY (MEMCPY) + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + +#ifdef USE_AS_MEMMOVE + cmp %eax, %edx + jb L(copy_forward) + je L(fwd_write_0bytes) + cmp $48, %ecx + jb L(bk_write_less48bytes) + add %ecx, %eax + cmp %eax, %edx + movl SRC(%esp), %eax + jb L(copy_backward) + +L(copy_forward): +#endif + cmp $48, %ecx + jae L(48bytesormore) + +L(fwd_write_less32bytes): +#ifndef USE_AS_MEMMOVE + cmp %dl, %al + jb L(bk_write) +#endif + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) +#ifndef USE_AS_MEMMOVE +L(bk_write): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) +#endif + + ALIGN (4) +/* ECX > 32 and EDX is 4 byte aligned. */ +L(48bytesormore): + movdqu (%eax), %xmm0 + PUSH (%edi) + movl %edx, %edi + and $-16, %edx + PUSH (%esi) + cfi_remember_state + add $16, %edx + movl %edi, %esi + sub %edx, %edi + add %edi, %ecx + sub %edi, %eax + +#ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +#endif + + mov %eax, %edi + jae L(large_page) + and $0xf, %edi + jz L(shl_0) + + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) + + ALIGN (4) +L(shl_0): + movdqu %xmm0, (%esi) + xor %edi, %edi + cmp $127, %ecx + ja L(shl_0_gobble) + lea -32(%ecx), %ecx +L(shl_0_loop): + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi +L(shl_0_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + add %edi, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state +L(shl_0_gobble): + +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi +# else + mov __x86_data_cache_size_half, %edi +# endif +#endif + mov %edi, %esi + shr $3, %esi + sub %esi, %edi + cmp %edi, %ecx + jae L(shl_0_gobble_mem_start) + sub $128, %ecx + ALIGN (4) +L(shl_0_gobble_cache_loop): + movdqa (%eax), %xmm0 + movaps 0x10(%eax), %xmm1 + movaps 0x20(%eax), %xmm2 + movaps 0x30(%eax), %xmm3 + movaps 0x40(%eax), %xmm4 + movaps 0x50(%eax), %xmm5 + movaps 0x60(%eax), %xmm6 + movaps 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $128, %ecx + movdqa %xmm0, (%edx) + movaps %xmm1, 0x10(%edx) + movaps %xmm2, 0x20(%edx) + movaps %xmm3, 0x30(%edx) + movaps %xmm4, 0x40(%edx) + movaps %xmm5, 0x50(%edx) + movaps %xmm6, 0x60(%edx) + movaps %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_cache_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(shl_0_cache_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx +L(shl_0_cache_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_cache_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx +L(shl_0_cache_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_cache_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx +L(shl_0_cache_less_16bytes): + add %ecx, %edx + add %ecx, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_0_gobble_mem_start): + cmp %al, %dl + je L(copy_page_by_rep) + sub $128, %ecx +L(shl_0_gobble_mem_loop): + prefetchnta 0x1c0(%eax) + prefetchnta 0x280(%eax) + prefetchnta 0x1c0(%edx) + prefetchnta 0x280(%edx) + + movdqa (%eax), %xmm0 + movaps 0x10(%eax), %xmm1 + movaps 0x20(%eax), %xmm2 + movaps 0x30(%eax), %xmm3 + movaps 0x40(%eax), %xmm4 + movaps 0x50(%eax), %xmm5 + movaps 0x60(%eax), %xmm6 + movaps 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $0x80, %ecx + movdqa %xmm0, (%edx) + movaps %xmm1, 0x10(%edx) + movaps %xmm2, 0x20(%edx) + movaps %xmm3, 0x30(%edx) + movaps %xmm4, 0x40(%edx) + movaps %xmm5, 0x50(%edx) + movaps %xmm6, 0x60(%edx) + movaps %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_mem_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(shl_0_mem_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx +L(shl_0_mem_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_mem_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx +L(shl_0_mem_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_mem_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx +L(shl_0_mem_less_16bytes): + add %ecx, %edx + add %ecx, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_1): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $1, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_1_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_1_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_1_loop) + +L(shl_1_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 1(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_2): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $2, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_2_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_2_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_2_loop) + +L(shl_2_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 2(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_3): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $3, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_3_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_3_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_3_loop) + +L(shl_3_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 3(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_4): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $4, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_4_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_4_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_4_loop) + +L(shl_4_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 4(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_5): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $5, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_5_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_5_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_5_loop) + +L(shl_5_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 5(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_6): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $6, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_6_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_6_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_6_loop) + +L(shl_6_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 6(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_7): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $7, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_7_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_7_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_7_loop) + +L(shl_7_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 7(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_8): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $8, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_8_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_8_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_8_loop) + +L(shl_8_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 8(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_9): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $9, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_9_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_9_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_9_loop) + +L(shl_9_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 9(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_10): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $10, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_10_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_10_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_10_loop) + +L(shl_10_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 10(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_11): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $11, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_11_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_11_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_11_loop) + +L(shl_11_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 11(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_12): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $12, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_12_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_12_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_12_loop) + +L(shl_12_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 12(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_13): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $13, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_13_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_13_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_13_loop) + +L(shl_13_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 13(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_14): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $14, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_14_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_14_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_14_loop) + +L(shl_14_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 14(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_15): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $15, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_15_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_15_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_15_loop) + +L(shl_15_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 15(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(fwd_write_44bytes): + movl -44(%eax), %ecx + movl %ecx, -44(%edx) +L(fwd_write_40bytes): + movl -40(%eax), %ecx + movl %ecx, -40(%edx) +L(fwd_write_36bytes): + movl -36(%eax), %ecx + movl %ecx, -36(%edx) +L(fwd_write_32bytes): + movl -32(%eax), %ecx + movl %ecx, -32(%edx) +L(fwd_write_28bytes): + movl -28(%eax), %ecx + movl %ecx, -28(%edx) +L(fwd_write_24bytes): + movl -24(%eax), %ecx + movl %ecx, -24(%edx) +L(fwd_write_20bytes): + movl -20(%eax), %ecx + movl %ecx, -20(%edx) +L(fwd_write_16bytes): + movl -16(%eax), %ecx + movl %ecx, -16(%edx) +L(fwd_write_12bytes): + movl -12(%eax), %ecx + movl %ecx, -12(%edx) +L(fwd_write_8bytes): + movl -8(%eax), %ecx + movl %ecx, -8(%edx) +L(fwd_write_4bytes): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +L(fwd_write_0bytes): +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_5bytes): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_45bytes): + movl -45(%eax), %ecx + movl %ecx, -45(%edx) +L(fwd_write_41bytes): + movl -41(%eax), %ecx + movl %ecx, -41(%edx) +L(fwd_write_37bytes): + movl -37(%eax), %ecx + movl %ecx, -37(%edx) +L(fwd_write_33bytes): + movl -33(%eax), %ecx + movl %ecx, -33(%edx) +L(fwd_write_29bytes): + movl -29(%eax), %ecx + movl %ecx, -29(%edx) +L(fwd_write_25bytes): + movl -25(%eax), %ecx + movl %ecx, -25(%edx) +L(fwd_write_21bytes): + movl -21(%eax), %ecx + movl %ecx, -21(%edx) +L(fwd_write_17bytes): + movl -17(%eax), %ecx + movl %ecx, -17(%edx) +L(fwd_write_13bytes): + movl -13(%eax), %ecx + movl %ecx, -13(%edx) +L(fwd_write_9bytes): + movl -9(%eax), %ecx + movl %ecx, -9(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) +L(fwd_write_1bytes): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_46bytes): + movl -46(%eax), %ecx + movl %ecx, -46(%edx) +L(fwd_write_42bytes): + movl -42(%eax), %ecx + movl %ecx, -42(%edx) +L(fwd_write_38bytes): + movl -38(%eax), %ecx + movl %ecx, -38(%edx) +L(fwd_write_34bytes): + movl -34(%eax), %ecx + movl %ecx, -34(%edx) +L(fwd_write_30bytes): + movl -30(%eax), %ecx + movl %ecx, -30(%edx) +L(fwd_write_26bytes): + movl -26(%eax), %ecx + movl %ecx, -26(%edx) +L(fwd_write_22bytes): + movl -22(%eax), %ecx + movl %ecx, -22(%edx) +L(fwd_write_18bytes): + movl -18(%eax), %ecx + movl %ecx, -18(%edx) +L(fwd_write_14bytes): + movl -14(%eax), %ecx + movl %ecx, -14(%edx) +L(fwd_write_10bytes): + movl -10(%eax), %ecx + movl %ecx, -10(%edx) +L(fwd_write_6bytes): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) +L(fwd_write_2bytes): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_47bytes): + movl -47(%eax), %ecx + movl %ecx, -47(%edx) +L(fwd_write_43bytes): + movl -43(%eax), %ecx + movl %ecx, -43(%edx) +L(fwd_write_39bytes): + movl -39(%eax), %ecx + movl %ecx, -39(%edx) +L(fwd_write_35bytes): + movl -35(%eax), %ecx + movl %ecx, -35(%edx) +L(fwd_write_31bytes): + movl -31(%eax), %ecx + movl %ecx, -31(%edx) +L(fwd_write_27bytes): + movl -27(%eax), %ecx + movl %ecx, -27(%edx) +L(fwd_write_23bytes): + movl -23(%eax), %ecx + movl %ecx, -23(%edx) +L(fwd_write_19bytes): + movl -19(%eax), %ecx + movl %ecx, -19(%edx) +L(fwd_write_15bytes): + movl -15(%eax), %ecx + movl %ecx, -15(%edx) +L(fwd_write_11bytes): + movl -11(%eax), %ecx + movl %ecx, -11(%edx) +L(fwd_write_7bytes): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) +L(fwd_write_3bytes): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN_END + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(large_page): + movdqu (%eax), %xmm1 + movdqu %xmm0, (%esi) + movntdq %xmm1, (%edx) + add $0x10, %eax + add $0x10, %edx + sub $0x10, %ecx + cmp %al, %dl + je L(copy_page_by_rep) +L(large_page_loop_init): + POP (%esi) + sub $0x80, %ecx + POP (%edi) +L(large_page_loop): + prefetchnta 0x1c0(%eax) + prefetchnta 0x280(%eax) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + movdqu 0x40(%eax), %xmm4 + movdqu 0x50(%eax), %xmm5 + movdqu 0x60(%eax), %xmm6 + movdqu 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + lfence + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + movntdq %xmm4, 0x40(%edx) + movntdq %xmm5, 0x50(%edx) + movntdq %xmm6, 0x60(%edx) + movntdq %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + jae L(large_page_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(large_page_less_64bytes) + + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + lea 0x40(%eax), %eax + + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + lea 0x40(%edx), %edx + sub $0x40, %ecx +L(large_page_less_64bytes): + cmp $32, %ecx + jb L(large_page_less_32bytes) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + lea 0x20(%eax), %eax + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + lea 0x20(%edx), %edx + sub $0x20, %ecx +L(large_page_less_32bytes): + add %ecx, %edx + add %ecx, %eax + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(copy_page_by_rep): + mov %eax, %esi + mov %edx, %edi + mov %ecx, %edx + shr $2, %ecx + and $3, %edx + rep movsl + jz L(copy_page_by_rep_exit) + cmp $2, %edx + jb L(copy_page_by_rep_left_1) + movzwl (%esi), %eax + movw %ax, (%edi) + add $2, %esi + add $2, %edi + sub $2, %edx + jz L(copy_page_by_rep_exit) +L(copy_page_by_rep_left_1): + movzbl (%esi), %eax + movb %al, (%edi) +L(copy_page_by_rep_exit): + POP (%esi) + POP (%edi) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_44bytes): + movl 40(%eax), %ecx + movl %ecx, 40(%edx) +L(bk_write_40bytes): + movl 36(%eax), %ecx + movl %ecx, 36(%edx) +L(bk_write_36bytes): + movl 32(%eax), %ecx + movl %ecx, 32(%edx) +L(bk_write_32bytes): + movl 28(%eax), %ecx + movl %ecx, 28(%edx) +L(bk_write_28bytes): + movl 24(%eax), %ecx + movl %ecx, 24(%edx) +L(bk_write_24bytes): + movl 20(%eax), %ecx + movl %ecx, 20(%edx) +L(bk_write_20bytes): + movl 16(%eax), %ecx + movl %ecx, 16(%edx) +L(bk_write_16bytes): + movl 12(%eax), %ecx + movl %ecx, 12(%edx) +L(bk_write_12bytes): + movl 8(%eax), %ecx + movl %ecx, 8(%edx) +L(bk_write_8bytes): + movl 4(%eax), %ecx + movl %ecx, 4(%edx) +L(bk_write_4bytes): + movl (%eax), %ecx + movl %ecx, (%edx) +L(bk_write_0bytes): +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_45bytes): + movl 41(%eax), %ecx + movl %ecx, 41(%edx) +L(bk_write_41bytes): + movl 37(%eax), %ecx + movl %ecx, 37(%edx) +L(bk_write_37bytes): + movl 33(%eax), %ecx + movl %ecx, 33(%edx) +L(bk_write_33bytes): + movl 29(%eax), %ecx + movl %ecx, 29(%edx) +L(bk_write_29bytes): + movl 25(%eax), %ecx + movl %ecx, 25(%edx) +L(bk_write_25bytes): + movl 21(%eax), %ecx + movl %ecx, 21(%edx) +L(bk_write_21bytes): + movl 17(%eax), %ecx + movl %ecx, 17(%edx) +L(bk_write_17bytes): + movl 13(%eax), %ecx + movl %ecx, 13(%edx) +L(bk_write_13bytes): + movl 9(%eax), %ecx + movl %ecx, 9(%edx) +L(bk_write_9bytes): + movl 5(%eax), %ecx + movl %ecx, 5(%edx) +L(bk_write_5bytes): + movl 1(%eax), %ecx + movl %ecx, 1(%edx) +L(bk_write_1bytes): + movzbl (%eax), %ecx + movb %cl, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_46bytes): + movl 42(%eax), %ecx + movl %ecx, 42(%edx) +L(bk_write_42bytes): + movl 38(%eax), %ecx + movl %ecx, 38(%edx) +L(bk_write_38bytes): + movl 34(%eax), %ecx + movl %ecx, 34(%edx) +L(bk_write_34bytes): + movl 30(%eax), %ecx + movl %ecx, 30(%edx) +L(bk_write_30bytes): + movl 26(%eax), %ecx + movl %ecx, 26(%edx) +L(bk_write_26bytes): + movl 22(%eax), %ecx + movl %ecx, 22(%edx) +L(bk_write_22bytes): + movl 18(%eax), %ecx + movl %ecx, 18(%edx) +L(bk_write_18bytes): + movl 14(%eax), %ecx + movl %ecx, 14(%edx) +L(bk_write_14bytes): + movl 10(%eax), %ecx + movl %ecx, 10(%edx) +L(bk_write_10bytes): + movl 6(%eax), %ecx + movl %ecx, 6(%edx) +L(bk_write_6bytes): + movl 2(%eax), %ecx + movl %ecx, 2(%edx) +L(bk_write_2bytes): + movzwl (%eax), %ecx + movw %cx, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_47bytes): + movl 43(%eax), %ecx + movl %ecx, 43(%edx) +L(bk_write_43bytes): + movl 39(%eax), %ecx + movl %ecx, 39(%edx) +L(bk_write_39bytes): + movl 35(%eax), %ecx + movl %ecx, 35(%edx) +L(bk_write_35bytes): + movl 31(%eax), %ecx + movl %ecx, 31(%edx) +L(bk_write_31bytes): + movl 27(%eax), %ecx + movl %ecx, 27(%edx) +L(bk_write_27bytes): + movl 23(%eax), %ecx + movl %ecx, 23(%edx) +L(bk_write_23bytes): + movl 19(%eax), %ecx + movl %ecx, 19(%edx) +L(bk_write_19bytes): + movl 15(%eax), %ecx + movl %ecx, 15(%edx) +L(bk_write_15bytes): + movl 11(%eax), %ecx + movl %ecx, 11(%edx) +L(bk_write_11bytes): + movl 7(%eax), %ecx + movl %ecx, 7(%edx) +L(bk_write_7bytes): + movl 3(%eax), %ecx + movl %ecx, 3(%edx) +L(bk_write_3bytes): + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN_END + + + .pushsection .rodata.ssse3,"a",@progbits + ALIGN (2) +L(table_48bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) + + ALIGN (2) +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + ALIGN (2) +L(table_48_bytes_bwd): + .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) + + .popsection + +#ifdef USE_AS_MEMMOVE + ALIGN (4) +L(copy_backward): + PUSH (%esi) + movl %eax, %esi + add %ecx, %edx + add %ecx, %esi + testl $0x3, %edx + jnz L(bk_align) + +L(bk_aligned_4): + cmp $64, %ecx + jae L(bk_write_more64bytes) + +L(bk_write_64bytesless): + cmp $32, %ecx + jb L(bk_write_less32bytes) + +L(bk_write_more32bytes): + /* Copy 32 bytes at a time. */ + sub $32, %ecx + movl -4(%esi), %eax + movl %eax, -4(%edx) + movl -8(%esi), %eax + movl %eax, -8(%edx) + movl -12(%esi), %eax + movl %eax, -12(%edx) + movl -16(%esi), %eax + movl %eax, -16(%edx) + movl -20(%esi), %eax + movl %eax, -20(%edx) + movl -24(%esi), %eax + movl %eax, -24(%edx) + movl -28(%esi), %eax + movl %eax, -28(%edx) + movl -32(%esi), %eax + movl %eax, -32(%edx) + sub $32, %edx + sub $32, %esi + +L(bk_write_less32bytes): + movl %esi, %eax + sub %ecx, %edx + sub %ecx, %eax + POP (%esi) +L(bk_write_less48bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) + + CFI_PUSH (%esi) + ALIGN (4) +L(bk_align): + cmp $8, %ecx + jbe L(bk_write_less32bytes) + testl $1, %edx + /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, + then (EDX & 2) must be != 0. */ + jz L(bk_got2) + sub $1, %esi + sub $1, %ecx + sub $1, %edx + movzbl (%esi), %eax + movb %al, (%edx) + + testl $2, %edx + jz L(bk_aligned_4) + +L(bk_got2): + sub $2, %esi + sub $2, %ecx + sub $2, %edx + movzwl (%esi), %eax + movw %ax, (%edx) + jmp L(bk_aligned_4) + + ALIGN (4) +L(bk_write_more64bytes): + /* Check alignment of last byte. */ + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + +/* EDX is aligned 4 bytes, but not 16 bytes. */ +L(bk_ssse3_align): + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + +L(bk_ssse3_cpy_pre): + cmp $64, %ecx + jb L(bk_write_more32bytes) + +L(bk_ssse3_cpy): + sub $64, %esi + sub $64, %ecx + sub $64, %edx + movdqu 0x30(%esi), %xmm3 + movdqa %xmm3, 0x30(%edx) + movdqu 0x20(%esi), %xmm2 + movdqa %xmm2, 0x20(%edx) + movdqu 0x10(%esi), %xmm1 + movdqa %xmm1, 0x10(%edx) + movdqu (%esi), %xmm0 + movdqa %xmm0, (%edx) + cmp $64, %ecx + jae L(bk_ssse3_cpy) + jmp L(bk_write_64bytesless) + +#endif + +END (MEMCPY) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S new file mode 100644 index 0000000000..53e8a6ca1d --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S @@ -0,0 +1,3162 @@ +/* memcpy with SSSE3 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +# include <sysdep.h> +# include "asm-syntax.h" + +# ifndef MEMCPY +# define MEMCPY __memcpy_ssse3 +# define MEMCPY_CHK __memcpy_chk_ssse3 +# endif + +# ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +# else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifdef SHARED +# define PARMS 8 /* Preserve EBX. */ +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx, INDEX, SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx +# else + +# define PARMS 4 +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(, INDEX, SCALE) +# endif + + .section .text.ssse3,"ax",@progbits +# if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +# endif +ENTRY (MEMCPY) + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + +# ifdef USE_AS_MEMMOVE + cmp %eax, %edx + jb L(copy_forward) + je L(fwd_write_0bytes) + cmp $32, %ecx + jae L(memmove_bwd) + jmp L(bk_write_less32bytes_2) + + .p2align 4 +L(memmove_bwd): + add %ecx, %eax + cmp %eax, %edx + movl SRC(%esp), %eax + jb L(copy_backward) + +L(copy_forward): +# endif + cmp $48, %ecx + jae L(48bytesormore) + +L(fwd_write_less32bytes): +# ifndef USE_AS_MEMMOVE + cmp %dl, %al + jb L(bk_write) +# endif + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) +# ifndef USE_AS_MEMMOVE + .p2align 4 +L(bk_write): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) +# endif + + .p2align 4 +L(48bytesormore): +# ifndef USE_AS_MEMMOVE + movlpd (%eax), %xmm0 + movlpd 8(%eax), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) +# else + movdqu (%eax), %xmm0 +# endif + PUSH (%edi) + movl %edx, %edi + and $-16, %edx + add $16, %edx + sub %edx, %edi + add %edi, %ecx + sub %edi, %eax + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +# endif + + mov %eax, %edi + jae L(large_page) + and $0xf, %edi + jz L(shl_0) + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) + + .p2align 4 +L(shl_0): +# ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +# endif + xor %edi, %edi + cmp $127, %ecx + ja L(shl_0_gobble) + lea -32(%ecx), %ecx + + .p2align 4 +L(shl_0_loop): + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + +L(shl_0_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + add %edi, %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_0_gobble): +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + POP (%edi) + lea -128(%ecx), %ecx + jae L(shl_0_gobble_mem_loop) + + .p2align 4 +L(shl_0_gobble_cache_loop): + movdqa (%eax), %xmm0 + movdqa 0x10(%eax), %xmm1 + movdqa 0x20(%eax), %xmm2 + movdqa 0x30(%eax), %xmm3 + movdqa 0x40(%eax), %xmm4 + movdqa 0x50(%eax), %xmm5 + movdqa 0x60(%eax), %xmm6 + movdqa 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa %xmm2, 0x20(%edx) + movdqa %xmm3, 0x30(%edx) + movdqa %xmm4, 0x40(%edx) + movdqa %xmm5, 0x50(%edx) + movdqa %xmm6, 0x60(%edx) + movdqa %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_cache_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(shl_0_cache_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx + +L(shl_0_cache_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_cache_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx + +L(shl_0_cache_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_cache_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx + +L(shl_0_cache_less_16bytes): + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 +L(shl_0_gobble_mem_loop): + prefetcht0 0x1c0(%eax) + prefetcht0 0x280(%eax) + prefetcht0 0x1c0(%edx) + + movdqa (%eax), %xmm0 + movdqa 0x10(%eax), %xmm1 + movdqa 0x20(%eax), %xmm2 + movdqa 0x30(%eax), %xmm3 + movdqa 0x40(%eax), %xmm4 + movdqa 0x50(%eax), %xmm5 + movdqa 0x60(%eax), %xmm6 + movdqa 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $0x80, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa %xmm2, 0x20(%edx) + movdqa %xmm3, 0x30(%edx) + movdqa %xmm4, 0x40(%edx) + movdqa %xmm5, 0x50(%edx) + movdqa %xmm6, 0x60(%edx) + movdqa %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_mem_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(shl_0_mem_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx + +L(shl_0_mem_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_mem_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx + +L(shl_0_mem_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_mem_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx + +L(shl_0_mem_less_16bytes): + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) + + .p2align 4 +L(shl_1): +# ifndef USE_AS_MEMMOVE + movaps -1(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -1(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_1_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl1LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + movaps 47(%eax), %xmm4 + movaps 63(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + palignr $1, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $1, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $1, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl1LoopStart) + +L(Shl1LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_1_no_prefetch): + lea -32(%ecx), %ecx + lea -1(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_1_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_1_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_1_no_prefetch_loop) + +L(sh_1_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 1(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_2): +# ifndef USE_AS_MEMMOVE + movaps -2(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -2(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_2_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl2LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + movaps 46(%eax), %xmm4 + movaps 62(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + palignr $2, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $2, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $2, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl2LoopStart) + +L(Shl2LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_2_no_prefetch): + lea -32(%ecx), %ecx + lea -2(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_2_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_2_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_2_no_prefetch_loop) + +L(sh_2_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 2(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_3): +# ifndef USE_AS_MEMMOVE + movaps -3(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -3(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_3_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl3LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + movaps 45(%eax), %xmm4 + movaps 61(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + palignr $3, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $3, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $3, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl3LoopStart) + +L(Shl3LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_3_no_prefetch): + lea -32(%ecx), %ecx + lea -3(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_3_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_3_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_3_no_prefetch_loop) + +L(sh_3_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 3(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_4): +# ifndef USE_AS_MEMMOVE + movaps -4(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -4(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_4_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl4LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + movaps 44(%eax), %xmm4 + movaps 60(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + palignr $4, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $4, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $4, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl4LoopStart) + +L(Shl4LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_4_no_prefetch): + lea -32(%ecx), %ecx + lea -4(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_4_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_4_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_4_no_prefetch_loop) + +L(sh_4_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 4(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_5): +# ifndef USE_AS_MEMMOVE + movaps -5(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -5(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_5_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl5LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + movaps 43(%eax), %xmm4 + movaps 59(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + palignr $5, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $5, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $5, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl5LoopStart) + +L(Shl5LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_5_no_prefetch): + lea -32(%ecx), %ecx + lea -5(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_5_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_5_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_5_no_prefetch_loop) + +L(sh_5_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 5(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_6): +# ifndef USE_AS_MEMMOVE + movaps -6(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -6(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_6_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl6LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + movaps 42(%eax), %xmm4 + movaps 58(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + palignr $6, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $6, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $6, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl6LoopStart) + +L(Shl6LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_6_no_prefetch): + lea -32(%ecx), %ecx + lea -6(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_6_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_6_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_6_no_prefetch_loop) + +L(sh_6_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 6(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_7): +# ifndef USE_AS_MEMMOVE + movaps -7(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -7(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_7_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl7LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + movaps 41(%eax), %xmm4 + movaps 57(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + palignr $7, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $7, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $7, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl7LoopStart) + +L(Shl7LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_7_no_prefetch): + lea -32(%ecx), %ecx + lea -7(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_7_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_7_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_7_no_prefetch_loop) + +L(sh_7_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 7(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_8): +# ifndef USE_AS_MEMMOVE + movaps -8(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -8(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_8_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl8LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + movaps 40(%eax), %xmm4 + movaps 56(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + palignr $8, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $8, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $8, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl8LoopStart) + +L(LoopLeave8): + add $32, %ecx + jle L(shl_end_0) + + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_8_no_prefetch): + lea -32(%ecx), %ecx + lea -8(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_8_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_8_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_8_no_prefetch_loop) + +L(sh_8_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 8(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_9): +# ifndef USE_AS_MEMMOVE + movaps -9(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -9(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_9_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl9LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + movaps 39(%eax), %xmm4 + movaps 55(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + palignr $9, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $9, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $9, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl9LoopStart) + +L(Shl9LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_9_no_prefetch): + lea -32(%ecx), %ecx + lea -9(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_9_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_9_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_9_no_prefetch_loop) + +L(sh_9_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 9(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_10): +# ifndef USE_AS_MEMMOVE + movaps -10(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -10(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_10_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl10LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + movaps 38(%eax), %xmm4 + movaps 54(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + palignr $10, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $10, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $10, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl10LoopStart) + +L(Shl10LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_10_no_prefetch): + lea -32(%ecx), %ecx + lea -10(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_10_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_10_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_10_no_prefetch_loop) + +L(sh_10_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 10(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_11): +# ifndef USE_AS_MEMMOVE + movaps -11(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -11(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_11_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl11LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + movaps 37(%eax), %xmm4 + movaps 53(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + palignr $11, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $11, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $11, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl11LoopStart) + +L(Shl11LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_11_no_prefetch): + lea -32(%ecx), %ecx + lea -11(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_11_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_11_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_11_no_prefetch_loop) + +L(sh_11_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 11(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_12): +# ifndef USE_AS_MEMMOVE + movaps -12(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -12(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_12_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl12LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + movaps 36(%eax), %xmm4 + movaps 52(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + palignr $12, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $12, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $12, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl12LoopStart) + +L(Shl12LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_12_no_prefetch): + lea -32(%ecx), %ecx + lea -12(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_12_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_12_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_12_no_prefetch_loop) + +L(sh_12_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 12(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_13): +# ifndef USE_AS_MEMMOVE + movaps -13(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -13(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_13_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl13LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + movaps 35(%eax), %xmm4 + movaps 51(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + palignr $13, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $13, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $13, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl13LoopStart) + +L(Shl13LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_13_no_prefetch): + lea -32(%ecx), %ecx + lea -13(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_13_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_13_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_13_no_prefetch_loop) + +L(sh_13_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 13(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_14): +# ifndef USE_AS_MEMMOVE + movaps -14(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -14(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_14_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl14LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + movaps 34(%eax), %xmm4 + movaps 50(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + palignr $14, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $14, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $14, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl14LoopStart) + +L(Shl14LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_14_no_prefetch): + lea -32(%ecx), %ecx + lea -14(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_14_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_14_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_14_no_prefetch_loop) + +L(sh_14_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 14(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_15): +# ifndef USE_AS_MEMMOVE + movaps -15(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -15(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_15_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl15LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + movaps 33(%eax), %xmm4 + movaps 49(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + palignr $15, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $15, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $15, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl15LoopStart) + +L(Shl15LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_15_no_prefetch): + lea -32(%ecx), %ecx + lea -15(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_15_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_15_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_15_no_prefetch_loop) + +L(sh_15_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 15(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_end_0): + lea 32(%ecx), %ecx + lea (%edx, %ecx), %edx + lea (%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 +L(fwd_write_44bytes): + movq -44(%eax), %xmm0 + movq %xmm0, -44(%edx) +L(fwd_write_36bytes): + movq -36(%eax), %xmm0 + movq %xmm0, -36(%edx) +L(fwd_write_28bytes): + movq -28(%eax), %xmm0 + movq %xmm0, -28(%edx) +L(fwd_write_20bytes): + movq -20(%eax), %xmm0 + movq %xmm0, -20(%edx) +L(fwd_write_12bytes): + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) +L(fwd_write_4bytes): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_40bytes): + movq -40(%eax), %xmm0 + movq %xmm0, -40(%edx) +L(fwd_write_32bytes): + movq -32(%eax), %xmm0 + movq %xmm0, -32(%edx) +L(fwd_write_24bytes): + movq -24(%eax), %xmm0 + movq %xmm0, -24(%edx) +L(fwd_write_16bytes): + movq -16(%eax), %xmm0 + movq %xmm0, -16(%edx) +L(fwd_write_8bytes): + movq -8(%eax), %xmm0 + movq %xmm0, -8(%edx) +L(fwd_write_0bytes): +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_5bytes): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_45bytes): + movq -45(%eax), %xmm0 + movq %xmm0, -45(%edx) +L(fwd_write_37bytes): + movq -37(%eax), %xmm0 + movq %xmm0, -37(%edx) +L(fwd_write_29bytes): + movq -29(%eax), %xmm0 + movq %xmm0, -29(%edx) +L(fwd_write_21bytes): + movq -21(%eax), %xmm0 + movq %xmm0, -21(%edx) +L(fwd_write_13bytes): + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_41bytes): + movq -41(%eax), %xmm0 + movq %xmm0, -41(%edx) +L(fwd_write_33bytes): + movq -33(%eax), %xmm0 + movq %xmm0, -33(%edx) +L(fwd_write_25bytes): + movq -25(%eax), %xmm0 + movq %xmm0, -25(%edx) +L(fwd_write_17bytes): + movq -17(%eax), %xmm0 + movq %xmm0, -17(%edx) +L(fwd_write_9bytes): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) +L(fwd_write_1bytes): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_46bytes): + movq -46(%eax), %xmm0 + movq %xmm0, -46(%edx) +L(fwd_write_38bytes): + movq -38(%eax), %xmm0 + movq %xmm0, -38(%edx) +L(fwd_write_30bytes): + movq -30(%eax), %xmm0 + movq %xmm0, -30(%edx) +L(fwd_write_22bytes): + movq -22(%eax), %xmm0 + movq %xmm0, -22(%edx) +L(fwd_write_14bytes): + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) +L(fwd_write_6bytes): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_42bytes): + movq -42(%eax), %xmm0 + movq %xmm0, -42(%edx) +L(fwd_write_34bytes): + movq -34(%eax), %xmm0 + movq %xmm0, -34(%edx) +L(fwd_write_26bytes): + movq -26(%eax), %xmm0 + movq %xmm0, -26(%edx) +L(fwd_write_18bytes): + movq -18(%eax), %xmm0 + movq %xmm0, -18(%edx) +L(fwd_write_10bytes): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) +L(fwd_write_2bytes): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_47bytes): + movq -47(%eax), %xmm0 + movq %xmm0, -47(%edx) +L(fwd_write_39bytes): + movq -39(%eax), %xmm0 + movq %xmm0, -39(%edx) +L(fwd_write_31bytes): + movq -31(%eax), %xmm0 + movq %xmm0, -31(%edx) +L(fwd_write_23bytes): + movq -23(%eax), %xmm0 + movq %xmm0, -23(%edx) +L(fwd_write_15bytes): + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) +L(fwd_write_7bytes): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_43bytes): + movq -43(%eax), %xmm0 + movq %xmm0, -43(%edx) +L(fwd_write_35bytes): + movq -35(%eax), %xmm0 + movq %xmm0, -35(%edx) +L(fwd_write_27bytes): + movq -27(%eax), %xmm0 + movq %xmm0, -27(%edx) +L(fwd_write_19bytes): + movq -19(%eax), %xmm0 + movq %xmm0, -19(%edx) +L(fwd_write_11bytes): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) +L(fwd_write_3bytes): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_40bytes_align): + movdqa -40(%eax), %xmm0 + movdqa %xmm0, -40(%edx) +L(fwd_write_24bytes_align): + movdqa -24(%eax), %xmm0 + movdqa %xmm0, -24(%edx) +L(fwd_write_8bytes_align): + movq -8(%eax), %xmm0 + movq %xmm0, -8(%edx) +L(fwd_write_0bytes_align): +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_32bytes_align): + movdqa -32(%eax), %xmm0 + movdqa %xmm0, -32(%edx) +L(fwd_write_16bytes_align): + movdqa -16(%eax), %xmm0 + movdqa %xmm0, -16(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_5bytes_align): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_45bytes_align): + movdqa -45(%eax), %xmm0 + movdqa %xmm0, -45(%edx) +L(fwd_write_29bytes_align): + movdqa -29(%eax), %xmm0 + movdqa %xmm0, -29(%edx) +L(fwd_write_13bytes_align): + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_37bytes_align): + movdqa -37(%eax), %xmm0 + movdqa %xmm0, -37(%edx) +L(fwd_write_21bytes_align): + movdqa -21(%eax), %xmm0 + movdqa %xmm0, -21(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_41bytes_align): + movdqa -41(%eax), %xmm0 + movdqa %xmm0, -41(%edx) +L(fwd_write_25bytes_align): + movdqa -25(%eax), %xmm0 + movdqa %xmm0, -25(%edx) +L(fwd_write_9bytes_align): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) +L(fwd_write_1bytes_align): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_33bytes_align): + movdqa -33(%eax), %xmm0 + movdqa %xmm0, -33(%edx) +L(fwd_write_17bytes_align): + movdqa -17(%eax), %xmm0 + movdqa %xmm0, -17(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_46bytes_align): + movdqa -46(%eax), %xmm0 + movdqa %xmm0, -46(%edx) +L(fwd_write_30bytes_align): + movdqa -30(%eax), %xmm0 + movdqa %xmm0, -30(%edx) +L(fwd_write_14bytes_align): + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) +L(fwd_write_6bytes_align): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_38bytes_align): + movdqa -38(%eax), %xmm0 + movdqa %xmm0, -38(%edx) +L(fwd_write_22bytes_align): + movdqa -22(%eax), %xmm0 + movdqa %xmm0, -22(%edx) + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_42bytes_align): + movdqa -42(%eax), %xmm0 + movdqa %xmm0, -42(%edx) +L(fwd_write_26bytes_align): + movdqa -26(%eax), %xmm0 + movdqa %xmm0, -26(%edx) +L(fwd_write_10bytes_align): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) +L(fwd_write_2bytes_align): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_34bytes_align): + movdqa -34(%eax), %xmm0 + movdqa %xmm0, -34(%edx) +L(fwd_write_18bytes_align): + movdqa -18(%eax), %xmm0 + movdqa %xmm0, -18(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_47bytes_align): + movdqa -47(%eax), %xmm0 + movdqa %xmm0, -47(%edx) +L(fwd_write_31bytes_align): + movdqa -31(%eax), %xmm0 + movdqa %xmm0, -31(%edx) +L(fwd_write_15bytes_align): + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) +L(fwd_write_7bytes_align): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_39bytes_align): + movdqa -39(%eax), %xmm0 + movdqa %xmm0, -39(%edx) +L(fwd_write_23bytes_align): + movdqa -23(%eax), %xmm0 + movdqa %xmm0, -23(%edx) + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_43bytes_align): + movdqa -43(%eax), %xmm0 + movdqa %xmm0, -43(%edx) +L(fwd_write_27bytes_align): + movdqa -27(%eax), %xmm0 + movdqa %xmm0, -27(%edx) +L(fwd_write_11bytes_align): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) +L(fwd_write_3bytes_align): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_35bytes_align): + movdqa -35(%eax), %xmm0 + movdqa %xmm0, -35(%edx) +L(fwd_write_19bytes_align): + movdqa -19(%eax), %xmm0 + movdqa %xmm0, -19(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_44bytes_align): + movdqa -44(%eax), %xmm0 + movdqa %xmm0, -44(%edx) +L(fwd_write_28bytes_align): + movdqa -28(%eax), %xmm0 + movdqa %xmm0, -28(%edx) +L(fwd_write_12bytes_align): + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) +L(fwd_write_4bytes_align): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_36bytes_align): + movdqa -36(%eax), %xmm0 + movdqa %xmm0, -36(%edx) +L(fwd_write_20bytes_align): + movdqa -20(%eax), %xmm0 + movdqa %xmm0, -20(%edx) + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN_END + + CFI_PUSH (%edi) + + .p2align 4 +L(large_page): + movdqu (%eax), %xmm1 +# ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +# endif + lea 16(%eax), %eax + movntdq %xmm1, (%edx) + lea 16(%edx), %edx + lea -0x90(%ecx), %ecx + POP (%edi) + + .p2align 4 +L(large_page_loop): + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + movdqu 0x40(%eax), %xmm4 + movdqu 0x50(%eax), %xmm5 + movdqu 0x60(%eax), %xmm6 + movdqu 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + movntdq %xmm4, 0x40(%edx) + movntdq %xmm5, 0x50(%edx) + movntdq %xmm6, 0x60(%edx) + movntdq %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + jae L(large_page_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(large_page_less_64bytes) + + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + lea 0x40(%eax), %eax + + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + lea 0x40(%edx), %edx + sub $0x40, %ecx +L(large_page_less_64bytes): + cmp $32, %ecx + jb L(large_page_less_32bytes) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + lea 0x20(%eax), %eax + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + lea 0x20(%edx), %edx + sub $0x20, %ecx +L(large_page_less_32bytes): + add %ecx, %edx + add %ecx, %eax + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 +L(bk_write_44bytes): + movq 36(%eax), %xmm0 + movq %xmm0, 36(%edx) +L(bk_write_36bytes): + movq 28(%eax), %xmm0 + movq %xmm0, 28(%edx) +L(bk_write_28bytes): + movq 20(%eax), %xmm0 + movq %xmm0, 20(%edx) +L(bk_write_20bytes): + movq 12(%eax), %xmm0 + movq %xmm0, 12(%edx) +L(bk_write_12bytes): + movq 4(%eax), %xmm0 + movq %xmm0, 4(%edx) +L(bk_write_4bytes): + movl (%eax), %ecx + movl %ecx, (%edx) +L(bk_write_0bytes): +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_40bytes): + movq 32(%eax), %xmm0 + movq %xmm0, 32(%edx) +L(bk_write_32bytes): + movq 24(%eax), %xmm0 + movq %xmm0, 24(%edx) +L(bk_write_24bytes): + movq 16(%eax), %xmm0 + movq %xmm0, 16(%edx) +L(bk_write_16bytes): + movq 8(%eax), %xmm0 + movq %xmm0, 8(%edx) +L(bk_write_8bytes): + movq (%eax), %xmm0 + movq %xmm0, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_45bytes): + movq 37(%eax), %xmm0 + movq %xmm0, 37(%edx) +L(bk_write_37bytes): + movq 29(%eax), %xmm0 + movq %xmm0, 29(%edx) +L(bk_write_29bytes): + movq 21(%eax), %xmm0 + movq %xmm0, 21(%edx) +L(bk_write_21bytes): + movq 13(%eax), %xmm0 + movq %xmm0, 13(%edx) +L(bk_write_13bytes): + movq 5(%eax), %xmm0 + movq %xmm0, 5(%edx) +L(bk_write_5bytes): + movl 1(%eax), %ecx + movl %ecx, 1(%edx) +L(bk_write_1bytes): + movzbl (%eax), %ecx + movb %cl, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_41bytes): + movq 33(%eax), %xmm0 + movq %xmm0, 33(%edx) +L(bk_write_33bytes): + movq 25(%eax), %xmm0 + movq %xmm0, 25(%edx) +L(bk_write_25bytes): + movq 17(%eax), %xmm0 + movq %xmm0, 17(%edx) +L(bk_write_17bytes): + movq 9(%eax), %xmm0 + movq %xmm0, 9(%edx) +L(bk_write_9bytes): + movq 1(%eax), %xmm0 + movq %xmm0, 1(%edx) + movzbl (%eax), %ecx + movb %cl, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_46bytes): + movq 38(%eax), %xmm0 + movq %xmm0, 38(%edx) +L(bk_write_38bytes): + movq 30(%eax), %xmm0 + movq %xmm0, 30(%edx) +L(bk_write_30bytes): + movq 22(%eax), %xmm0 + movq %xmm0, 22(%edx) +L(bk_write_22bytes): + movq 14(%eax), %xmm0 + movq %xmm0, 14(%edx) +L(bk_write_14bytes): + movq 6(%eax), %xmm0 + movq %xmm0, 6(%edx) +L(bk_write_6bytes): + movl 2(%eax), %ecx + movl %ecx, 2(%edx) + movzwl (%eax), %ecx + movw %cx, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_42bytes): + movq 34(%eax), %xmm0 + movq %xmm0, 34(%edx) +L(bk_write_34bytes): + movq 26(%eax), %xmm0 + movq %xmm0, 26(%edx) +L(bk_write_26bytes): + movq 18(%eax), %xmm0 + movq %xmm0, 18(%edx) +L(bk_write_18bytes): + movq 10(%eax), %xmm0 + movq %xmm0, 10(%edx) +L(bk_write_10bytes): + movq 2(%eax), %xmm0 + movq %xmm0, 2(%edx) +L(bk_write_2bytes): + movzwl (%eax), %ecx + movw %cx, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_47bytes): + movq 39(%eax), %xmm0 + movq %xmm0, 39(%edx) +L(bk_write_39bytes): + movq 31(%eax), %xmm0 + movq %xmm0, 31(%edx) +L(bk_write_31bytes): + movq 23(%eax), %xmm0 + movq %xmm0, 23(%edx) +L(bk_write_23bytes): + movq 15(%eax), %xmm0 + movq %xmm0, 15(%edx) +L(bk_write_15bytes): + movq 7(%eax), %xmm0 + movq %xmm0, 7(%edx) +L(bk_write_7bytes): + movl 3(%eax), %ecx + movl %ecx, 3(%edx) + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_43bytes): + movq 35(%eax), %xmm0 + movq %xmm0, 35(%edx) +L(bk_write_35bytes): + movq 27(%eax), %xmm0 + movq %xmm0, 27(%edx) +L(bk_write_27bytes): + movq 19(%eax), %xmm0 + movq %xmm0, 19(%edx) +L(bk_write_19bytes): + movq 11(%eax), %xmm0 + movq %xmm0, 11(%edx) +L(bk_write_11bytes): + movq 3(%eax), %xmm0 + movq %xmm0, 3(%edx) +L(bk_write_3bytes): + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN_END + + + .pushsection .rodata.ssse3,"a",@progbits + .p2align 2 +L(table_48bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) + + .p2align 2 +L(table_48bytes_fwd_align): + .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align)) + + .p2align 2 +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + .p2align 2 +L(table_48_bytes_bwd): + .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) + + .popsection + +# ifdef USE_AS_MEMMOVE + .p2align 4 +L(copy_backward): + PUSH (%edi) + movl %eax, %edi + lea (%ecx,%edx,1),%edx + lea (%ecx,%edi,1),%edi + testl $0x3, %edx + jnz L(bk_align) + +L(bk_aligned_4): + cmp $64, %ecx + jae L(bk_write_more64bytes) + +L(bk_write_64bytesless): + cmp $32, %ecx + jb L(bk_write_less32bytes) + +L(bk_write_more32bytes): + /* Copy 32 bytes at a time. */ + sub $32, %ecx + movq -8(%edi), %xmm0 + movq %xmm0, -8(%edx) + movq -16(%edi), %xmm0 + movq %xmm0, -16(%edx) + movq -24(%edi), %xmm0 + movq %xmm0, -24(%edx) + movq -32(%edi), %xmm0 + movq %xmm0, -32(%edx) + sub $32, %edx + sub $32, %edi + +L(bk_write_less32bytes): + movl %edi, %eax + sub %ecx, %edx + sub %ecx, %eax + POP (%edi) +L(bk_write_less32bytes_2): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(bk_align): + cmp $8, %ecx + jbe L(bk_write_less32bytes) + testl $1, %edx + /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, + then (EDX & 2) must be != 0. */ + jz L(bk_got2) + sub $1, %edi + sub $1, %ecx + sub $1, %edx + movzbl (%edi), %eax + movb %al, (%edx) + + testl $2, %edx + jz L(bk_aligned_4) + +L(bk_got2): + sub $2, %edi + sub $2, %ecx + sub $2, %edx + movzwl (%edi), %eax + movw %ax, (%edx) + jmp L(bk_aligned_4) + + .p2align 4 +L(bk_write_more64bytes): + /* Check alignment of last byte. */ + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + +/* EDX is aligned 4 bytes, but not 16 bytes. */ +L(bk_ssse3_align): + sub $4, %edi + sub $4, %ecx + sub $4, %edx + movl (%edi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %edi + sub $4, %ecx + sub $4, %edx + movl (%edi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %edi + sub $4, %ecx + sub $4, %edx + movl (%edi), %eax + movl %eax, (%edx) + +L(bk_ssse3_cpy_pre): + cmp $64, %ecx + jb L(bk_write_more32bytes) + + .p2align 4 +L(bk_ssse3_cpy): + sub $64, %edi + sub $64, %ecx + sub $64, %edx + movdqu 0x30(%edi), %xmm3 + movdqa %xmm3, 0x30(%edx) + movdqu 0x20(%edi), %xmm2 + movdqa %xmm2, 0x20(%edx) + movdqu 0x10(%edi), %xmm1 + movdqa %xmm1, 0x10(%edx) + movdqu (%edi), %xmm0 + movdqa %xmm0, (%edx) + cmp $64, %ecx + jae L(bk_ssse3_cpy) + jmp L(bk_write_64bytesless) + +# endif + +END (MEMCPY) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S new file mode 100644 index 0000000000..f725944620 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S @@ -0,0 +1,78 @@ +/* Multiple versions of memcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need memcpy before the initialization + happened. */ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(memcpy) + .type memcpy, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memcpy_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_ssse3_rep) +2: ret +END(memcpy) + +# undef ENTRY +# define ENTRY(name) \ + .type __memcpy_ia32, @function; \ + .p2align 4; \ + .globl __memcpy_ia32; \ + .hidden __memcpy_ia32; \ + __memcpy_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __memcpy_chk_ia32, @function; \ + .globl __memcpy_chk_ia32; \ + .p2align 4; \ + __memcpy_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32 + +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memcpy; __GI_memcpy = __memcpy_ia32 +#endif + +#include "../memcpy.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S new file mode 100644 index 0000000000..1b4fbe2e6f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S @@ -0,0 +1,50 @@ +/* Multiple versions of __memcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch memcpy functions for static binaries. + */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__memcpy_chk) + .type __memcpy_chk, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_chk_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3_rep) +2: ret +END(__memcpy_chk) +# else +# include "../memcpy_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S new file mode 100644 index 0000000000..3873594cb2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_sse2_unaligned +#define MEMCPY_CHK __memmove_chk_sse2_unaligned +#include "memcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S new file mode 100644 index 0000000000..d202fc4a13 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3_rep +#define MEMCPY_CHK __memmove_chk_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S new file mode 100644 index 0000000000..295430b1ef --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3 +#define MEMCPY_CHK __memmove_chk_ssse3 +#include "memcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S new file mode 100644 index 0000000000..6eb418ca7f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S @@ -0,0 +1,89 @@ +/* Multiple versions of memmove + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) + .text +ENTRY(memmove) + .type memmove, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memmove_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_ssse3_rep) +2: ret +END(memmove) + +# ifdef SHARED +# undef ENTRY +# define ENTRY(name) \ + .type __memmove_ia32, @function; \ + .p2align 4; \ + .globl __memmove_ia32; \ + .hidden __memmove_ia32; \ + __memmove_ia32: cfi_startproc; \ + CALL_MCOUNT +# else +# undef ENTRY +# define ENTRY(name) \ + .type __memmove_ia32, @function; \ + .globl __memmove_ia32; \ + .p2align 4; \ + __memmove_ia32: cfi_startproc; \ + CALL_MCOUNT +# endif + +# undef END +# define END(name) \ + cfi_endproc; .size __memmove_ia32, .-__memmove_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __memmove_chk_ia32, @function; \ + .globl __memmove_chk_ia32; \ + .p2align 4; \ + __memmove_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memmove; __GI_memmove = __memmove_ia32 +# endif +#endif + +#include "../memmove.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S new file mode 100644 index 0000000000..314834c4c6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S @@ -0,0 +1,94 @@ +/* Multiple versions of __memmove_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) + .text +ENTRY(__memmove_chk) + .type __memmove_chk, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memmove_chk_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_chk_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3_rep) +2: ret +END(__memmove_chk) + +# ifndef SHARED + .type __memmove_chk_sse2_unaligned, @function + .p2align 4; +__memmove_chk_sse2_unaligned: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_sse2_unaligned + cfi_endproc + .size __memmove_chk_sse2_unaligned, .-__memmove_chk_sse2_unaligned + + .type __memmove_chk_ssse3, @function + .p2align 4; +__memmove_chk_ssse3: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_ssse3 + cfi_endproc + .size __memmove_chk_ssse3, .-__memmove_chk_ssse3 + + .type __memmove_chk_ssse3_rep, @function + .p2align 4; +__memmove_chk_ssse3_rep: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_ssse3_rep + cfi_endproc + .size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep + + .type __memmove_chk_ia32, @function + .p2align 4; +__memmove_chk_ia32: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_ia32 + cfi_endproc + .size __memmove_chk_ia32, .-__memmove_chk_ia32 +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S new file mode 100644 index 0000000000..a1cea50771 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_sse2_unaligned +#define MEMCPY_CHK __mempcpy_chk_sse2_unaligned +#include "memcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S new file mode 100644 index 0000000000..5357b33e18 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_ssse3_rep +#define MEMCPY_CHK __mempcpy_chk_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S new file mode 100644 index 0000000000..822d98e954 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_ssse3 +#define MEMCPY_CHK __mempcpy_chk_ssse3 +#include "memcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S new file mode 100644 index 0000000000..06e377fbc9 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S @@ -0,0 +1,81 @@ +/* Multiple versions of mempcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need mempcpy before the initialization + happened. */ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(__mempcpy) + .type __mempcpy, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__mempcpy_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_ssse3_rep) +2: ret +END(__mempcpy) + +# undef ENTRY +# define ENTRY(name) \ + .type __mempcpy_ia32, @function; \ + .p2align 4; \ + .globl __mempcpy_ia32; \ + .hidden __mempcpy_ia32; \ + __mempcpy_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __mempcpy_chk_ia32, @function; \ + .globl __mempcpy_chk_ia32; \ + .p2align 4; \ + __mempcpy_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32 + +# undef libc_hidden_def +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_def(name) \ + .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32 +# define libc_hidden_builtin_def(name) \ + .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32 +#endif + +#include "../mempcpy.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S new file mode 100644 index 0000000000..e13e5248a5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S @@ -0,0 +1,50 @@ +/* Multiple versions of __mempcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch mempcpy functions for static binaries. + */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__mempcpy_chk) + .type __mempcpy_chk, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_chk_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3_rep) +2: ret +END(__mempcpy_chk) +# else +# include "../mempcpy_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c new file mode 100644 index 0000000000..ef7bbbe792 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c @@ -0,0 +1,7 @@ +#if IS_IN (libc) +# define MEMRCHR __memrchr_ia32 +# include <string.h> +extern void *__memrchr_ia32 (const void *, int, size_t); +#endif + +#include "string/memrchr.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S new file mode 100644 index 0000000000..dbbe94fd08 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S @@ -0,0 +1,417 @@ +/* Optimized memrchr with sse2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +# define MEMCHR __memrchr_sse2_bsf + + .text +ENTRY (MEMCHR) + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + mov LEN(%esp), %edx + + sub $16, %edx + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + add %edx, %ecx + punpcklbw %xmm1, %xmm1 + + movdqu (%ecx), %xmm0 + pshufd $0, %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %ecx + mov %ecx, %eax + and $15, %eax + jz L(loop_prolog) + + add $16, %ecx + add $16, %edx + sub %eax, %ecx + sub %eax, %edx + + .p2align 4 +/* Loop start on aligned string. */ +L(loop_prolog): + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm4 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches0) + + mov %ecx, %eax + and $63, %eax + test %eax, %eax + jz L(align64_loop) + + add $64, %ecx + add $64, %edx + sub %eax, %ecx + sub %eax, %edx + + .p2align 4 +L(align64_loop): + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa (%ecx), %xmm0 + movdqa 16(%ecx), %xmm2 + movdqa 32(%ecx), %xmm3 + movdqa 48(%ecx), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm3, %xmm0 + pmaxub %xmm4, %xmm2 + pmaxub %xmm0, %xmm2 + pmovmskb %xmm2, %eax + + test %eax, %eax + jz L(align64_loop) + + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches48) + + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm2 + + pcmpeqb %xmm1, %xmm2 + pcmpeqb (%ecx), %xmm1 + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + pmovmskb %xmm1, %eax + bsr %eax, %eax + + add %ecx, %eax + ret + + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb (%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches0_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 32(%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches32_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsr %eax, %eax + add %ecx, %eax + ret + + .p2align 4 +L(matches16): + bsr %eax, %eax + lea 16(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches32): + bsr %eax, %eax + lea 32(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches48): + bsr %eax, %eax + lea 48(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches0_1): + bsr %eax, %eax + sub $64, %edx + add %eax, %edx + jl L(return_null) + add %ecx, %eax + ret + + .p2align 4 +L(matches16_1): + bsr %eax, %eax + sub $48, %edx + add %eax, %edx + jl L(return_null) + lea 16(%ecx, %eax), %eax + ret + + .p2align 4 +L(matches32_1): + bsr %eax, %eax + sub $32, %edx + add %eax, %edx + jl L(return_null) + lea 32(%ecx, %eax), %eax + ret + + .p2align 4 +L(matches48_1): + bsr %eax, %eax + sub $16, %edx + add %eax, %edx + jl L(return_null) + lea 48(%ecx, %eax), %eax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + + .p2align 4 +L(length_less16_offset0): + mov %dl, %cl + pcmpeqb (%eax), %xmm1 + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + mov %edx, %ecx + + pmovmskb %xmm1, %edx + + and %ecx, %edx + test %edx, %edx + jz L(return_null) + + bsr %edx, %ecx + add %ecx, %eax + ret + + .p2align 4 +L(length_less16): + punpcklbw %xmm1, %xmm1 + mov %ecx, %eax + punpcklbw %xmm1, %xmm1 + add $16, %edx + jz L(return_null) + + pshufd $0, %xmm1, %xmm1 + and $15, %ecx + jz L(length_less16_offset0) + + PUSH (%edi) + mov %cl, %dh + add %dl, %dh + and $-16, %eax + + sub $16, %dh + ja L(length_less16_part2) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + sar %cl, %edi + add %ecx, %eax + mov %dl, %cl + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2): + movdqa 16(%eax), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %edi + + mov %cl, %ch + + mov %dh, %cl + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + + test %edi, %edi + jnz L(length_less16_part2_return) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + mov %ch, %cl + sar %cl, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + xor %ch, %ch + add %ecx, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2_return): + bsr %edi, %edi + lea 16(%eax, %edi), %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(ret_null): + xor %eax, %eax + POP (%edi) + ret + +END (MEMCHR) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S new file mode 100644 index 0000000000..5f7853f683 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S @@ -0,0 +1,724 @@ +/* Optimized memrchr with sse2 without bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + + atom_text_section +ENTRY (__memrchr_sse2) + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + mov LEN(%esp), %edx + + sub $16, %edx + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + add %edx, %ecx + punpcklbw %xmm1, %xmm1 + + movdqu (%ecx), %xmm0 + pshufd $0, %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(exit_dispatch) + + sub $64, %ecx + mov %ecx, %eax + and $15, %eax + jz L(loop_prolog) + + lea 16(%ecx), %ecx + lea 16(%edx), %edx + sub %eax, %edx + and $-16, %ecx + + .p2align 4 +/* Loop start on aligned string. */ +L(loop_prolog): + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm4 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(exit_dispatch) + + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(exit_dispatch) + + mov %ecx, %eax + and $63, %eax + test %eax, %eax + jz L(align64_loop) + + lea 64(%ecx), %ecx + lea 64(%edx), %edx + and $-64, %ecx + sub %eax, %edx + + .p2align 4 +L(align64_loop): + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa (%ecx), %xmm0 + movdqa 16(%ecx), %xmm2 + movdqa 32(%ecx), %xmm3 + movdqa 48(%ecx), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm3, %xmm0 + pmaxub %xmm4, %xmm2 + pmaxub %xmm0, %xmm2 + pmovmskb %xmm2, %eax + + test %eax, %eax + jz L(align64_loop) + + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches48) + + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm2 + + pcmpeqb %xmm1, %xmm2 + pcmpeqb (%ecx), %xmm1 + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + pmovmskb %xmm1, %eax + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb (%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches0_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 32(%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches32_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches16): + lea 16(%ecx), %ecx + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(matches32): + lea 32(%ecx), %ecx + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(matches48): + lea 48(%ecx), %ecx + + .p2align 4 +L(exit_dispatch): + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(exit_dispatch_8): + test $0x80, %al + jnz L(exit_8) + test $0x40, %al + jnz L(exit_7) + test $0x20, %al + jnz L(exit_6) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_high): + mov %ah, %dh + and $15 << 4, %dh + jnz L(exit_dispatch_high_8) + test $0x08, %ah + jnz L(exit_12) + test $0x04, %ah + jnz L(exit_11) + test $0x02, %ah + jnz L(exit_10) + lea 8(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_high_8): + test $0x80, %ah + jnz L(exit_16) + test $0x40, %ah + jnz L(exit_15) + test $0x20, %ah + jnz L(exit_14) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(exit_2): + lea 1(%ecx), %eax + ret + + .p2align 4 +L(exit_3): + lea 2(%ecx), %eax + ret + + .p2align 4 +L(exit_4): + lea 3(%ecx), %eax + ret + + .p2align 4 +L(exit_6): + lea 5(%ecx), %eax + ret + + .p2align 4 +L(exit_7): + lea 6(%ecx), %eax + ret + + .p2align 4 +L(exit_8): + lea 7(%ecx), %eax + ret + + .p2align 4 +L(exit_10): + lea 9(%ecx), %eax + ret + + .p2align 4 +L(exit_11): + lea 10(%ecx), %eax + ret + + .p2align 4 +L(exit_12): + lea 11(%ecx), %eax + ret + + .p2align 4 +L(exit_14): + lea 13(%ecx), %eax + ret + + .p2align 4 +L(exit_15): + lea 14(%ecx), %eax + ret + + .p2align 4 +L(exit_16): + lea 15(%ecx), %eax + ret + + .p2align 4 +L(matches0_1): + lea -64(%edx), %edx + + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(matches16_1): + lea -48(%edx), %edx + lea 16(%ecx), %ecx + + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(matches32_1): + lea -32(%edx), %edx + lea 32(%ecx), %ecx + + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(matches48_1): + lea -16(%edx), %edx + lea 48(%ecx), %ecx + + .p2align 4 +L(exit_dispatch_1): + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(exit_dispatch_1_8): + test $0x80, %al + jnz L(exit_1_8) + test $0x40, %al + jnz L(exit_1_7) + test $0x20, %al + jnz L(exit_1_6) + add $4, %edx + jl L(return_null) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_1_high): + mov %ah, %al + and $15 << 4, %al + jnz L(exit_dispatch_1_high_8) + test $0x08, %ah + jnz L(exit_1_12) + test $0x04, %ah + jnz L(exit_1_11) + test $0x02, %ah + jnz L(exit_1_10) + add $8, %edx + jl L(return_null) + lea 8(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_1_high_8): + test $0x80, %ah + jnz L(exit_1_16) + test $0x40, %ah + jnz L(exit_1_15) + test $0x20, %ah + jnz L(exit_1_14) + add $12, %edx + jl L(return_null) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(exit_1_2): + add $1, %edx + jl L(return_null) + lea 1(%ecx), %eax + ret + + .p2align 4 +L(exit_1_3): + add $2, %edx + jl L(return_null) + lea 2(%ecx), %eax + ret + + .p2align 4 +L(exit_1_4): + add $3, %edx + jl L(return_null) + lea 3(%ecx), %eax + ret + + .p2align 4 +L(exit_1_6): + add $5, %edx + jl L(return_null) + lea 5(%ecx), %eax + ret + + .p2align 4 +L(exit_1_7): + add $6, %edx + jl L(return_null) + lea 6(%ecx), %eax + ret + + .p2align 4 +L(exit_1_8): + add $7, %edx + jl L(return_null) + lea 7(%ecx), %eax + ret + + .p2align 4 +L(exit_1_10): + add $9, %edx + jl L(return_null) + lea 9(%ecx), %eax + ret + + .p2align 4 +L(exit_1_11): + add $10, %edx + jl L(return_null) + lea 10(%ecx), %eax + ret + + .p2align 4 +L(exit_1_12): + add $11, %edx + jl L(return_null) + lea 11(%ecx), %eax + ret + + .p2align 4 +L(exit_1_14): + add $13, %edx + jl L(return_null) + lea 13(%ecx), %eax + ret + + .p2align 4 +L(exit_1_15): + add $14, %edx + jl L(return_null) + lea 14(%ecx), %eax + ret + + .p2align 4 +L(exit_1_16): + add $15, %edx + jl L(return_null) + lea 15(%ecx), %eax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + + .p2align 4 +L(length_less16_offset0): + mov %dl, %cl + pcmpeqb (%eax), %xmm1 + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + mov %eax, %ecx + pmovmskb %xmm1, %eax + + and %edx, %eax + test %eax, %eax + jnz L(exit_dispatch) + + xor %eax, %eax + ret + + .p2align 4 +L(length_less16): + punpcklbw %xmm1, %xmm1 + add $16, %edx + je L(return_null) + punpcklbw %xmm1, %xmm1 + + mov %ecx, %eax + pshufd $0, %xmm1, %xmm1 + + and $15, %ecx + jz L(length_less16_offset0) + + PUSH (%edi) + + mov %cl, %dh + add %dl, %dh + and $-16, %eax + + sub $16, %dh + ja L(length_less16_part2) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + sar %cl, %edi + add %ecx, %eax + mov %dl, %cl + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2): + movdqa 16(%eax), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %edi + + mov %cl, %ch + + mov %dh, %cl + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + + test %edi, %edi + jnz L(length_less16_part2_return) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + mov %ch, %cl + sar %cl, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + xor %ch, %ch + add %ecx, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2_return): + bsr %edi, %edi + lea 16(%eax, %edi), %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(ret_null): + xor %eax, %eax + POP (%edi) + ret + +END (__memrchr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S new file mode 100644 index 0000000000..d4253a553b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S @@ -0,0 +1,45 @@ +/* Multiple versions of memrchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__memrchr) + .type __memrchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + HAS_CPU_FEATURE (SSE2) + jz 2f + HAS_ARCH_FEATURE (Slow_BSF) + jz 3f + + LOAD_FUNC_GOT_EAX (__memrchr_sse2) + ret + +2: LOAD_FUNC_GOT_EAX (__memrchr_ia32) + ret + +3: LOAD_FUNC_GOT_EAX (__memrchr_sse2_bsf) + ret +END(__memrchr) + +weak_alias(__memrchr, memrchr) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S new file mode 100644 index 0000000000..3221077e49 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S @@ -0,0 +1,811 @@ +/* memset with SSE2 and REP string. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_BZERO +# define DEST PARMS +# define LEN DEST+4 +# define SETRTNVAL +#else +# define DEST PARMS +# define CHR DEST+4 +# define LEN CHR+4 +# define SETRTNVAL movl DEST(%esp), %eax +#endif + +#ifdef SHARED +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define PARMS 8 /* Preserve EBX. */ +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + add $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + add (%ebx,%ecx,4), %ebx; \ + add %ecx, %edx; \ + /* We loaded the jump table and adjusted EDX. Go. */ \ + jmp *%ebx +#else +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define PARMS 4 +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + add %ecx, %edx; \ + jmp *TABLE(,%ecx,4) +#endif + + .section .text.sse2,"ax",@progbits +#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO +ENTRY (__memset_chk_sse2_rep) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk_sse2_rep) +#endif +ENTRY (__memset_sse2_rep) + ENTRANCE + + movl LEN(%esp), %ecx +#ifdef USE_AS_BZERO + xor %eax, %eax +#else + movzbl CHR(%esp), %eax + movb %al, %ah + /* Fill the whole EAX with pattern. */ + movl %eax, %edx + shl $16, %eax + or %edx, %eax +#endif + movl DEST(%esp), %edx + cmp $32, %ecx + jae L(32bytesormore) + +L(write_less32bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes)) + + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_less_32bytes): + .int JMPTBL (L(write_0bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_7bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_32bytes)) + .popsection + + ALIGN (4) +L(write_28bytes): + movl %eax, -28(%edx) +L(write_24bytes): + movl %eax, -24(%edx) +L(write_20bytes): + movl %eax, -20(%edx) +L(write_16bytes): + movl %eax, -16(%edx) +L(write_12bytes): + movl %eax, -12(%edx) +L(write_8bytes): + movl %eax, -8(%edx) +L(write_4bytes): + movl %eax, -4(%edx) +L(write_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(write_29bytes): + movl %eax, -29(%edx) +L(write_25bytes): + movl %eax, -25(%edx) +L(write_21bytes): + movl %eax, -21(%edx) +L(write_17bytes): + movl %eax, -17(%edx) +L(write_13bytes): + movl %eax, -13(%edx) +L(write_9bytes): + movl %eax, -9(%edx) +L(write_5bytes): + movl %eax, -5(%edx) +L(write_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_30bytes): + movl %eax, -30(%edx) +L(write_26bytes): + movl %eax, -26(%edx) +L(write_22bytes): + movl %eax, -22(%edx) +L(write_18bytes): + movl %eax, -18(%edx) +L(write_14bytes): + movl %eax, -14(%edx) +L(write_10bytes): + movl %eax, -10(%edx) +L(write_6bytes): + movl %eax, -6(%edx) +L(write_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_31bytes): + movl %eax, -31(%edx) +L(write_27bytes): + movl %eax, -27(%edx) +L(write_23bytes): + movl %eax, -23(%edx) +L(write_19bytes): + movl %eax, -19(%edx) +L(write_15bytes): + movl %eax, -15(%edx) +L(write_11bytes): + movl %eax, -11(%edx) +L(write_7bytes): + movl %eax, -7(%edx) +L(write_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +/* ECX > 32 and EDX is 4 byte aligned. */ +L(32bytesormore): + /* Fill xmm0 with the pattern. */ +#ifdef USE_AS_BZERO + pxor %xmm0, %xmm0 +#else + movd %eax, %xmm0 + pshufd $0, %xmm0, %xmm0 +#endif + testl $0xf, %edx + jz L(aligned_16) +/* ECX > 32 and EDX is not 16 byte aligned. */ +L(not_aligned_16): + movdqu %xmm0, (%edx) + movl %edx, %eax + and $-16, %edx + add $16, %edx + sub %edx, %eax + add %eax, %ecx + movd %xmm0, %eax + + ALIGN (4) +L(aligned_16): + cmp $128, %ecx + jae L(128bytesormore) + +L(aligned_16_less128bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytesormore): + PUSH (%edi) +#ifdef DATA_CACHE_SIZE + PUSH (%ebx) + mov $DATA_CACHE_SIZE, %ebx +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov __x86_data_cache_size@GOTOFF(%ebx), %ebx +# else + PUSH (%ebx) + mov __x86_data_cache_size, %ebx +# endif +#endif + mov %ebx, %edi + shr $4, %ebx + sub %ebx, %edi +#if defined DATA_CACHE_SIZE || !defined SHARED + POP (%ebx) +#endif +/* + * When data size approximate the end of L1 cache, + * fast string will prefetch and combine data efficiently. + */ + cmp %edi, %ecx + jae L(128bytesormore_endof_L1) + subl $128, %ecx +L(128bytesormore_normal): + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jb L(128bytesless_normal) + + + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jae L(128bytesormore_normal) + +L(128bytesless_normal): + POP (%edi) + add $128, %ecx + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + CFI_PUSH (%edi) + ALIGN (4) +L(128bytesormore_endof_L1): + mov %edx, %edi + mov %ecx, %edx + shr $2, %ecx + and $3, %edx + rep stosl + jz L(copy_page_by_rep_exit) + cmp $2, %edx + jb L(copy_page_by_rep_left_1) + movw %ax, (%edi) + add $2, %edi + sub $2, %edx + jz L(copy_page_by_rep_exit) +L(copy_page_by_rep_left_1): + movb %al, (%edi) +L(copy_page_by_rep_exit): + POP (%edi) + SETRTNVAL + RETURN + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_16_128bytes): + .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes)) + .popsection + + ALIGN (4) +L(aligned_16_112bytes): + movdqa %xmm0, -112(%edx) +L(aligned_16_96bytes): + movdqa %xmm0, -96(%edx) +L(aligned_16_80bytes): + movdqa %xmm0, -80(%edx) +L(aligned_16_64bytes): + movdqa %xmm0, -64(%edx) +L(aligned_16_48bytes): + movdqa %xmm0, -48(%edx) +L(aligned_16_32bytes): + movdqa %xmm0, -32(%edx) +L(aligned_16_16bytes): + movdqa %xmm0, -16(%edx) +L(aligned_16_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_113bytes): + movdqa %xmm0, -113(%edx) +L(aligned_16_97bytes): + movdqa %xmm0, -97(%edx) +L(aligned_16_81bytes): + movdqa %xmm0, -81(%edx) +L(aligned_16_65bytes): + movdqa %xmm0, -65(%edx) +L(aligned_16_49bytes): + movdqa %xmm0, -49(%edx) +L(aligned_16_33bytes): + movdqa %xmm0, -33(%edx) +L(aligned_16_17bytes): + movdqa %xmm0, -17(%edx) +L(aligned_16_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_114bytes): + movdqa %xmm0, -114(%edx) +L(aligned_16_98bytes): + movdqa %xmm0, -98(%edx) +L(aligned_16_82bytes): + movdqa %xmm0, -82(%edx) +L(aligned_16_66bytes): + movdqa %xmm0, -66(%edx) +L(aligned_16_50bytes): + movdqa %xmm0, -50(%edx) +L(aligned_16_34bytes): + movdqa %xmm0, -34(%edx) +L(aligned_16_18bytes): + movdqa %xmm0, -18(%edx) +L(aligned_16_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_115bytes): + movdqa %xmm0, -115(%edx) +L(aligned_16_99bytes): + movdqa %xmm0, -99(%edx) +L(aligned_16_83bytes): + movdqa %xmm0, -83(%edx) +L(aligned_16_67bytes): + movdqa %xmm0, -67(%edx) +L(aligned_16_51bytes): + movdqa %xmm0, -51(%edx) +L(aligned_16_35bytes): + movdqa %xmm0, -35(%edx) +L(aligned_16_19bytes): + movdqa %xmm0, -19(%edx) +L(aligned_16_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_116bytes): + movdqa %xmm0, -116(%edx) +L(aligned_16_100bytes): + movdqa %xmm0, -100(%edx) +L(aligned_16_84bytes): + movdqa %xmm0, -84(%edx) +L(aligned_16_68bytes): + movdqa %xmm0, -68(%edx) +L(aligned_16_52bytes): + movdqa %xmm0, -52(%edx) +L(aligned_16_36bytes): + movdqa %xmm0, -36(%edx) +L(aligned_16_20bytes): + movdqa %xmm0, -20(%edx) +L(aligned_16_4bytes): + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_117bytes): + movdqa %xmm0, -117(%edx) +L(aligned_16_101bytes): + movdqa %xmm0, -101(%edx) +L(aligned_16_85bytes): + movdqa %xmm0, -85(%edx) +L(aligned_16_69bytes): + movdqa %xmm0, -69(%edx) +L(aligned_16_53bytes): + movdqa %xmm0, -53(%edx) +L(aligned_16_37bytes): + movdqa %xmm0, -37(%edx) +L(aligned_16_21bytes): + movdqa %xmm0, -21(%edx) +L(aligned_16_5bytes): + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_118bytes): + movdqa %xmm0, -118(%edx) +L(aligned_16_102bytes): + movdqa %xmm0, -102(%edx) +L(aligned_16_86bytes): + movdqa %xmm0, -86(%edx) +L(aligned_16_70bytes): + movdqa %xmm0, -70(%edx) +L(aligned_16_54bytes): + movdqa %xmm0, -54(%edx) +L(aligned_16_38bytes): + movdqa %xmm0, -38(%edx) +L(aligned_16_22bytes): + movdqa %xmm0, -22(%edx) +L(aligned_16_6bytes): + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_119bytes): + movdqa %xmm0, -119(%edx) +L(aligned_16_103bytes): + movdqa %xmm0, -103(%edx) +L(aligned_16_87bytes): + movdqa %xmm0, -87(%edx) +L(aligned_16_71bytes): + movdqa %xmm0, -71(%edx) +L(aligned_16_55bytes): + movdqa %xmm0, -55(%edx) +L(aligned_16_39bytes): + movdqa %xmm0, -39(%edx) +L(aligned_16_23bytes): + movdqa %xmm0, -23(%edx) +L(aligned_16_7bytes): + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_120bytes): + movdqa %xmm0, -120(%edx) +L(aligned_16_104bytes): + movdqa %xmm0, -104(%edx) +L(aligned_16_88bytes): + movdqa %xmm0, -88(%edx) +L(aligned_16_72bytes): + movdqa %xmm0, -72(%edx) +L(aligned_16_56bytes): + movdqa %xmm0, -56(%edx) +L(aligned_16_40bytes): + movdqa %xmm0, -40(%edx) +L(aligned_16_24bytes): + movdqa %xmm0, -24(%edx) +L(aligned_16_8bytes): + movq %xmm0, -8(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_121bytes): + movdqa %xmm0, -121(%edx) +L(aligned_16_105bytes): + movdqa %xmm0, -105(%edx) +L(aligned_16_89bytes): + movdqa %xmm0, -89(%edx) +L(aligned_16_73bytes): + movdqa %xmm0, -73(%edx) +L(aligned_16_57bytes): + movdqa %xmm0, -57(%edx) +L(aligned_16_41bytes): + movdqa %xmm0, -41(%edx) +L(aligned_16_25bytes): + movdqa %xmm0, -25(%edx) +L(aligned_16_9bytes): + movq %xmm0, -9(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_122bytes): + movdqa %xmm0, -122(%edx) +L(aligned_16_106bytes): + movdqa %xmm0, -106(%edx) +L(aligned_16_90bytes): + movdqa %xmm0, -90(%edx) +L(aligned_16_74bytes): + movdqa %xmm0, -74(%edx) +L(aligned_16_58bytes): + movdqa %xmm0, -58(%edx) +L(aligned_16_42bytes): + movdqa %xmm0, -42(%edx) +L(aligned_16_26bytes): + movdqa %xmm0, -26(%edx) +L(aligned_16_10bytes): + movq %xmm0, -10(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_123bytes): + movdqa %xmm0, -123(%edx) +L(aligned_16_107bytes): + movdqa %xmm0, -107(%edx) +L(aligned_16_91bytes): + movdqa %xmm0, -91(%edx) +L(aligned_16_75bytes): + movdqa %xmm0, -75(%edx) +L(aligned_16_59bytes): + movdqa %xmm0, -59(%edx) +L(aligned_16_43bytes): + movdqa %xmm0, -43(%edx) +L(aligned_16_27bytes): + movdqa %xmm0, -27(%edx) +L(aligned_16_11bytes): + movq %xmm0, -11(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_124bytes): + movdqa %xmm0, -124(%edx) +L(aligned_16_108bytes): + movdqa %xmm0, -108(%edx) +L(aligned_16_92bytes): + movdqa %xmm0, -92(%edx) +L(aligned_16_76bytes): + movdqa %xmm0, -76(%edx) +L(aligned_16_60bytes): + movdqa %xmm0, -60(%edx) +L(aligned_16_44bytes): + movdqa %xmm0, -44(%edx) +L(aligned_16_28bytes): + movdqa %xmm0, -28(%edx) +L(aligned_16_12bytes): + movq %xmm0, -12(%edx) + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_125bytes): + movdqa %xmm0, -125(%edx) +L(aligned_16_109bytes): + movdqa %xmm0, -109(%edx) +L(aligned_16_93bytes): + movdqa %xmm0, -93(%edx) +L(aligned_16_77bytes): + movdqa %xmm0, -77(%edx) +L(aligned_16_61bytes): + movdqa %xmm0, -61(%edx) +L(aligned_16_45bytes): + movdqa %xmm0, -45(%edx) +L(aligned_16_29bytes): + movdqa %xmm0, -29(%edx) +L(aligned_16_13bytes): + movq %xmm0, -13(%edx) + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_126bytes): + movdqa %xmm0, -126(%edx) +L(aligned_16_110bytes): + movdqa %xmm0, -110(%edx) +L(aligned_16_94bytes): + movdqa %xmm0, -94(%edx) +L(aligned_16_78bytes): + movdqa %xmm0, -78(%edx) +L(aligned_16_62bytes): + movdqa %xmm0, -62(%edx) +L(aligned_16_46bytes): + movdqa %xmm0, -46(%edx) +L(aligned_16_30bytes): + movdqa %xmm0, -30(%edx) +L(aligned_16_14bytes): + movq %xmm0, -14(%edx) + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_127bytes): + movdqa %xmm0, -127(%edx) +L(aligned_16_111bytes): + movdqa %xmm0, -111(%edx) +L(aligned_16_95bytes): + movdqa %xmm0, -95(%edx) +L(aligned_16_79bytes): + movdqa %xmm0, -79(%edx) +L(aligned_16_63bytes): + movdqa %xmm0, -63(%edx) +L(aligned_16_47bytes): + movdqa %xmm0, -47(%edx) +L(aligned_16_31bytes): + movdqa %xmm0, -31(%edx) +L(aligned_16_15bytes): + movq %xmm0, -15(%edx) + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN_END + +END (__memset_sse2_rep) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S new file mode 100644 index 0000000000..d7b8be9114 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S @@ -0,0 +1,860 @@ +/* memset with SSE2 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_BZERO +# define DEST PARMS +# define LEN DEST+4 +# define SETRTNVAL +#else +# define DEST PARMS +# define CHR DEST+4 +# define LEN CHR+4 +# define SETRTNVAL movl DEST(%esp), %eax +#endif + +#ifdef SHARED +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define PARMS 8 /* Preserve EBX. */ +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + add $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + add (%ebx,%ecx,4), %ebx; \ + add %ecx, %edx; \ + /* We loaded the jump table and adjusted EDX. Go. */ \ + jmp *%ebx +#else +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define PARMS 4 +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + add %ecx, %edx; \ + jmp *TABLE(,%ecx,4) +#endif + + .section .text.sse2,"ax",@progbits +#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO +ENTRY (__memset_chk_sse2) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk_sse2) +#endif +ENTRY (__memset_sse2) + ENTRANCE + + movl LEN(%esp), %ecx +#ifdef USE_AS_BZERO + xor %eax, %eax +#else + movzbl CHR(%esp), %eax + movb %al, %ah + /* Fill the whole EAX with pattern. */ + movl %eax, %edx + shl $16, %eax + or %edx, %eax +#endif + movl DEST(%esp), %edx + cmp $32, %ecx + jae L(32bytesormore) + +L(write_less32bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes)) + + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_less_32bytes): + .int JMPTBL (L(write_0bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_7bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_32bytes)) + .popsection + + ALIGN (4) +L(write_28bytes): + movl %eax, -28(%edx) +L(write_24bytes): + movl %eax, -24(%edx) +L(write_20bytes): + movl %eax, -20(%edx) +L(write_16bytes): + movl %eax, -16(%edx) +L(write_12bytes): + movl %eax, -12(%edx) +L(write_8bytes): + movl %eax, -8(%edx) +L(write_4bytes): + movl %eax, -4(%edx) +L(write_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(write_29bytes): + movl %eax, -29(%edx) +L(write_25bytes): + movl %eax, -25(%edx) +L(write_21bytes): + movl %eax, -21(%edx) +L(write_17bytes): + movl %eax, -17(%edx) +L(write_13bytes): + movl %eax, -13(%edx) +L(write_9bytes): + movl %eax, -9(%edx) +L(write_5bytes): + movl %eax, -5(%edx) +L(write_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_30bytes): + movl %eax, -30(%edx) +L(write_26bytes): + movl %eax, -26(%edx) +L(write_22bytes): + movl %eax, -22(%edx) +L(write_18bytes): + movl %eax, -18(%edx) +L(write_14bytes): + movl %eax, -14(%edx) +L(write_10bytes): + movl %eax, -10(%edx) +L(write_6bytes): + movl %eax, -6(%edx) +L(write_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_31bytes): + movl %eax, -31(%edx) +L(write_27bytes): + movl %eax, -27(%edx) +L(write_23bytes): + movl %eax, -23(%edx) +L(write_19bytes): + movl %eax, -19(%edx) +L(write_15bytes): + movl %eax, -15(%edx) +L(write_11bytes): + movl %eax, -11(%edx) +L(write_7bytes): + movl %eax, -7(%edx) +L(write_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +/* ECX > 32 and EDX is 4 byte aligned. */ +L(32bytesormore): + /* Fill xmm0 with the pattern. */ +#ifdef USE_AS_BZERO + pxor %xmm0, %xmm0 +#else + movd %eax, %xmm0 + pshufd $0, %xmm0, %xmm0 +#endif + testl $0xf, %edx + jz L(aligned_16) +/* ECX > 32 and EDX is not 16 byte aligned. */ +L(not_aligned_16): + movdqu %xmm0, (%edx) + movl %edx, %eax + and $-16, %edx + add $16, %edx + sub %edx, %eax + add %eax, %ecx + movd %xmm0, %eax + + ALIGN (4) +L(aligned_16): + cmp $128, %ecx + jae L(128bytesormore) + +L(aligned_16_less128bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytesormore): +#ifdef SHARED_CACHE_SIZE + PUSH (%ebx) + mov $SHARED_CACHE_SIZE, %ebx +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx +# else + PUSH (%ebx) + mov __x86_shared_cache_size, %ebx +# endif +#endif + cmp %ebx, %ecx + jae L(128bytesormore_nt_start) + + +#ifdef DATA_CACHE_SIZE + POP (%ebx) +# define RESTORE_EBX_STATE CFI_PUSH (%ebx) + cmp $DATA_CACHE_SIZE, %ecx +#else +# ifdef SHARED +# define RESTORE_EBX_STATE + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx +# else + POP (%ebx) +# define RESTORE_EBX_STATE CFI_PUSH (%ebx) + cmp __x86_data_cache_size, %ecx +# endif +#endif + + jae L(128bytes_L2_normal) + subl $128, %ecx +L(128bytesormore_normal): + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jb L(128bytesless_normal) + + + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jae L(128bytesormore_normal) + +L(128bytesless_normal): + add $128, %ecx + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytes_L2_normal): + prefetcht0 0x380(%edx) + prefetcht0 0x3c0(%edx) + sub $128, %ecx + movdqa %xmm0, (%edx) + movaps %xmm0, 0x10(%edx) + movaps %xmm0, 0x20(%edx) + movaps %xmm0, 0x30(%edx) + movaps %xmm0, 0x40(%edx) + movaps %xmm0, 0x50(%edx) + movaps %xmm0, 0x60(%edx) + movaps %xmm0, 0x70(%edx) + add $128, %edx + cmp $128, %ecx + jae L(128bytes_L2_normal) + +L(128bytesless_L2_normal): + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + RESTORE_EBX_STATE +L(128bytesormore_nt_start): + sub %ebx, %ecx + ALIGN (4) +L(128bytesormore_shared_cache_loop): + prefetcht0 0x3c0(%edx) + prefetcht0 0x380(%edx) + sub $0x80, %ebx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + add $0x80, %edx + cmp $0x80, %ebx + jae L(128bytesormore_shared_cache_loop) + cmp $0x80, %ecx + jb L(shared_cache_loop_end) + ALIGN (4) +L(128bytesormore_nt): + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm0, 0x10(%edx) + movntdq %xmm0, 0x20(%edx) + movntdq %xmm0, 0x30(%edx) + movntdq %xmm0, 0x40(%edx) + movntdq %xmm0, 0x50(%edx) + movntdq %xmm0, 0x60(%edx) + movntdq %xmm0, 0x70(%edx) + add $0x80, %edx + cmp $0x80, %ecx + jae L(128bytesormore_nt) + sfence +L(shared_cache_loop_end): +#if defined DATA_CACHE_SIZE || !defined SHARED + POP (%ebx) +#endif + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_16_128bytes): + .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes)) + .popsection + + ALIGN (4) +L(aligned_16_112bytes): + movdqa %xmm0, -112(%edx) +L(aligned_16_96bytes): + movdqa %xmm0, -96(%edx) +L(aligned_16_80bytes): + movdqa %xmm0, -80(%edx) +L(aligned_16_64bytes): + movdqa %xmm0, -64(%edx) +L(aligned_16_48bytes): + movdqa %xmm0, -48(%edx) +L(aligned_16_32bytes): + movdqa %xmm0, -32(%edx) +L(aligned_16_16bytes): + movdqa %xmm0, -16(%edx) +L(aligned_16_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_113bytes): + movdqa %xmm0, -113(%edx) +L(aligned_16_97bytes): + movdqa %xmm0, -97(%edx) +L(aligned_16_81bytes): + movdqa %xmm0, -81(%edx) +L(aligned_16_65bytes): + movdqa %xmm0, -65(%edx) +L(aligned_16_49bytes): + movdqa %xmm0, -49(%edx) +L(aligned_16_33bytes): + movdqa %xmm0, -33(%edx) +L(aligned_16_17bytes): + movdqa %xmm0, -17(%edx) +L(aligned_16_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_114bytes): + movdqa %xmm0, -114(%edx) +L(aligned_16_98bytes): + movdqa %xmm0, -98(%edx) +L(aligned_16_82bytes): + movdqa %xmm0, -82(%edx) +L(aligned_16_66bytes): + movdqa %xmm0, -66(%edx) +L(aligned_16_50bytes): + movdqa %xmm0, -50(%edx) +L(aligned_16_34bytes): + movdqa %xmm0, -34(%edx) +L(aligned_16_18bytes): + movdqa %xmm0, -18(%edx) +L(aligned_16_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_115bytes): + movdqa %xmm0, -115(%edx) +L(aligned_16_99bytes): + movdqa %xmm0, -99(%edx) +L(aligned_16_83bytes): + movdqa %xmm0, -83(%edx) +L(aligned_16_67bytes): + movdqa %xmm0, -67(%edx) +L(aligned_16_51bytes): + movdqa %xmm0, -51(%edx) +L(aligned_16_35bytes): + movdqa %xmm0, -35(%edx) +L(aligned_16_19bytes): + movdqa %xmm0, -19(%edx) +L(aligned_16_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_116bytes): + movdqa %xmm0, -116(%edx) +L(aligned_16_100bytes): + movdqa %xmm0, -100(%edx) +L(aligned_16_84bytes): + movdqa %xmm0, -84(%edx) +L(aligned_16_68bytes): + movdqa %xmm0, -68(%edx) +L(aligned_16_52bytes): + movdqa %xmm0, -52(%edx) +L(aligned_16_36bytes): + movdqa %xmm0, -36(%edx) +L(aligned_16_20bytes): + movdqa %xmm0, -20(%edx) +L(aligned_16_4bytes): + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_117bytes): + movdqa %xmm0, -117(%edx) +L(aligned_16_101bytes): + movdqa %xmm0, -101(%edx) +L(aligned_16_85bytes): + movdqa %xmm0, -85(%edx) +L(aligned_16_69bytes): + movdqa %xmm0, -69(%edx) +L(aligned_16_53bytes): + movdqa %xmm0, -53(%edx) +L(aligned_16_37bytes): + movdqa %xmm0, -37(%edx) +L(aligned_16_21bytes): + movdqa %xmm0, -21(%edx) +L(aligned_16_5bytes): + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_118bytes): + movdqa %xmm0, -118(%edx) +L(aligned_16_102bytes): + movdqa %xmm0, -102(%edx) +L(aligned_16_86bytes): + movdqa %xmm0, -86(%edx) +L(aligned_16_70bytes): + movdqa %xmm0, -70(%edx) +L(aligned_16_54bytes): + movdqa %xmm0, -54(%edx) +L(aligned_16_38bytes): + movdqa %xmm0, -38(%edx) +L(aligned_16_22bytes): + movdqa %xmm0, -22(%edx) +L(aligned_16_6bytes): + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_119bytes): + movdqa %xmm0, -119(%edx) +L(aligned_16_103bytes): + movdqa %xmm0, -103(%edx) +L(aligned_16_87bytes): + movdqa %xmm0, -87(%edx) +L(aligned_16_71bytes): + movdqa %xmm0, -71(%edx) +L(aligned_16_55bytes): + movdqa %xmm0, -55(%edx) +L(aligned_16_39bytes): + movdqa %xmm0, -39(%edx) +L(aligned_16_23bytes): + movdqa %xmm0, -23(%edx) +L(aligned_16_7bytes): + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_120bytes): + movdqa %xmm0, -120(%edx) +L(aligned_16_104bytes): + movdqa %xmm0, -104(%edx) +L(aligned_16_88bytes): + movdqa %xmm0, -88(%edx) +L(aligned_16_72bytes): + movdqa %xmm0, -72(%edx) +L(aligned_16_56bytes): + movdqa %xmm0, -56(%edx) +L(aligned_16_40bytes): + movdqa %xmm0, -40(%edx) +L(aligned_16_24bytes): + movdqa %xmm0, -24(%edx) +L(aligned_16_8bytes): + movq %xmm0, -8(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_121bytes): + movdqa %xmm0, -121(%edx) +L(aligned_16_105bytes): + movdqa %xmm0, -105(%edx) +L(aligned_16_89bytes): + movdqa %xmm0, -89(%edx) +L(aligned_16_73bytes): + movdqa %xmm0, -73(%edx) +L(aligned_16_57bytes): + movdqa %xmm0, -57(%edx) +L(aligned_16_41bytes): + movdqa %xmm0, -41(%edx) +L(aligned_16_25bytes): + movdqa %xmm0, -25(%edx) +L(aligned_16_9bytes): + movq %xmm0, -9(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_122bytes): + movdqa %xmm0, -122(%edx) +L(aligned_16_106bytes): + movdqa %xmm0, -106(%edx) +L(aligned_16_90bytes): + movdqa %xmm0, -90(%edx) +L(aligned_16_74bytes): + movdqa %xmm0, -74(%edx) +L(aligned_16_58bytes): + movdqa %xmm0, -58(%edx) +L(aligned_16_42bytes): + movdqa %xmm0, -42(%edx) +L(aligned_16_26bytes): + movdqa %xmm0, -26(%edx) +L(aligned_16_10bytes): + movq %xmm0, -10(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_123bytes): + movdqa %xmm0, -123(%edx) +L(aligned_16_107bytes): + movdqa %xmm0, -107(%edx) +L(aligned_16_91bytes): + movdqa %xmm0, -91(%edx) +L(aligned_16_75bytes): + movdqa %xmm0, -75(%edx) +L(aligned_16_59bytes): + movdqa %xmm0, -59(%edx) +L(aligned_16_43bytes): + movdqa %xmm0, -43(%edx) +L(aligned_16_27bytes): + movdqa %xmm0, -27(%edx) +L(aligned_16_11bytes): + movq %xmm0, -11(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_124bytes): + movdqa %xmm0, -124(%edx) +L(aligned_16_108bytes): + movdqa %xmm0, -108(%edx) +L(aligned_16_92bytes): + movdqa %xmm0, -92(%edx) +L(aligned_16_76bytes): + movdqa %xmm0, -76(%edx) +L(aligned_16_60bytes): + movdqa %xmm0, -60(%edx) +L(aligned_16_44bytes): + movdqa %xmm0, -44(%edx) +L(aligned_16_28bytes): + movdqa %xmm0, -28(%edx) +L(aligned_16_12bytes): + movq %xmm0, -12(%edx) + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_125bytes): + movdqa %xmm0, -125(%edx) +L(aligned_16_109bytes): + movdqa %xmm0, -109(%edx) +L(aligned_16_93bytes): + movdqa %xmm0, -93(%edx) +L(aligned_16_77bytes): + movdqa %xmm0, -77(%edx) +L(aligned_16_61bytes): + movdqa %xmm0, -61(%edx) +L(aligned_16_45bytes): + movdqa %xmm0, -45(%edx) +L(aligned_16_29bytes): + movdqa %xmm0, -29(%edx) +L(aligned_16_13bytes): + movq %xmm0, -13(%edx) + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_126bytes): + movdqa %xmm0, -126(%edx) +L(aligned_16_110bytes): + movdqa %xmm0, -110(%edx) +L(aligned_16_94bytes): + movdqa %xmm0, -94(%edx) +L(aligned_16_78bytes): + movdqa %xmm0, -78(%edx) +L(aligned_16_62bytes): + movdqa %xmm0, -62(%edx) +L(aligned_16_46bytes): + movdqa %xmm0, -46(%edx) +L(aligned_16_30bytes): + movdqa %xmm0, -30(%edx) +L(aligned_16_14bytes): + movq %xmm0, -14(%edx) + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_127bytes): + movdqa %xmm0, -127(%edx) +L(aligned_16_111bytes): + movdqa %xmm0, -111(%edx) +L(aligned_16_95bytes): + movdqa %xmm0, -95(%edx) +L(aligned_16_79bytes): + movdqa %xmm0, -79(%edx) +L(aligned_16_63bytes): + movdqa %xmm0, -63(%edx) +L(aligned_16_47bytes): + movdqa %xmm0, -47(%edx) +L(aligned_16_31bytes): + movdqa %xmm0, -31(%edx) +L(aligned_16_15bytes): + movq %xmm0, -15(%edx) + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN_END + +END (__memset_sse2) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S new file mode 100644 index 0000000000..f601663a9f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S @@ -0,0 +1,75 @@ +/* Multiple versions of memset + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) + .text +ENTRY(memset) + .type memset, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memset_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memset_sse2) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memset_sse2_rep) +2: ret +END(memset) + +# undef ENTRY +# define ENTRY(name) \ + .type __memset_ia32, @function; \ + .globl __memset_ia32; \ + .p2align 4; \ + __memset_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memset_ia32, .-__memset_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __memset_chk_ia32, @function; \ + .globl __memset_chk_ia32; \ + .p2align 4; \ + __memset_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memset; __GI_memset = __memset_ia32 +# endif + +# undef strong_alias +# define strong_alias(original, alias) +#endif + +#include "../memset.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S new file mode 100644 index 0000000000..573cf4208a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S @@ -0,0 +1,82 @@ +/* Multiple versions of __memset_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) + .text +ENTRY(__memset_chk) + .type __memset_chk, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memset_chk_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memset_chk_sse2) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memset_chk_sse2_rep) +2: ret +END(__memset_chk) + +# ifdef SHARED +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) + .section .gnu.warning.__memset_zero_constant_len_parameter + .string "memset used with constant zero length parameter; this could be due to transposed parameters" +# else + .text + .type __memset_chk_sse2, @function + .p2align 4; +__memset_chk_sse2: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memset_sse2 + cfi_endproc + .size __memset_chk_sse2, .-__memset_chk_sse2 + + .type __memset_chk_sse2_rep, @function + .p2align 4; +__memset_chk_sse2_rep: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memset_sse2_rep + cfi_endproc + .size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep + + .type __memset_chk_ia32, @function + .p2align 4; +__memset_chk_ia32: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memset_ia32 + cfi_endproc + .size __memset_chk_ia32, .-__memset_chk_ia32 +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S new file mode 100644 index 0000000000..88c0e5776c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S @@ -0,0 +1,3 @@ +#define USE_AS_RAWMEMCHR +#define MEMCHR __rawmemchr_sse2_bsf +#include "memchr-sse2-bsf.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S new file mode 100644 index 0000000000..038c74896b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_RAWMEMCHR +#define MEMCHR __rawmemchr_sse2 +#include "memchr-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S new file mode 100644 index 0000000000..0a41d63ee8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S @@ -0,0 +1,65 @@ +/* Multiple versions of rawmemchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__rawmemchr) + .type __rawmemchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + HAS_CPU_FEATURE (SSE2) + jz 2f + HAS_ARCH_FEATURE (Slow_BSF) + jz 3f + + LOAD_FUNC_GOT_EAX (__rawmemchr_sse2) + ret + +2: LOAD_FUNC_GOT_EAX (__rawmemchr_ia32) + ret + +3: LOAD_FUNC_GOT_EAX (__rawmemchr_sse2_bsf) + ret +END(__rawmemchr) + +weak_alias(__rawmemchr, rawmemchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __rawmemchr_ia32, @function; \ + .globl __rawmemchr_ia32; \ + .p2align 4; \ + __rawmemchr_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __rawmemchr_ia32, .-__rawmemchr_ia32 + +# undef libc_hidden_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_def(name) \ + .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_ia32 + +#endif +#include "../../rawmemchr.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c new file mode 100644 index 0000000000..1aa5440644 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c @@ -0,0 +1 @@ +#include <string/strnlen.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c new file mode 100644 index 0000000000..2e9619f97c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c @@ -0,0 +1,27 @@ +/* FMA version of fma. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +double +__fma_fma (double x, double y, double z) +{ + asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c new file mode 100644 index 0000000000..411ebb2ba9 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c @@ -0,0 +1,34 @@ +/* Multiple versions of fma. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include <math.h> +#include <init-arch.h> + +extern double __fma_ia32 (double x, double y, double z) attribute_hidden; +extern double __fma_fma (double x, double y, double z) attribute_hidden; + +libm_ifunc (__fma, + HAS_ARCH_FEATURE (FMA_Usable) ? __fma_fma : __fma_ia32); +weak_alias (__fma, fma) + +#define __fma __fma_ia32 + +#include <sysdeps/ieee754/ldbl-96/s_fma.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c new file mode 100644 index 0000000000..ee57abfda2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c @@ -0,0 +1,27 @@ +/* FMA version of fmaf. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +float +__fmaf_fma (float x, float y, float z) +{ + asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c new file mode 100644 index 0000000000..00b0fbcfc5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c @@ -0,0 +1,34 @@ +/* Multiple versions of fmaf. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include <math.h> +#include <init-arch.h> + +extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden; +extern float __fmaf_fma (float x, float y, float z) attribute_hidden; + +libm_ifunc (__fmaf, + HAS_ARCH_FEATURE (FMA_Usable) ? __fmaf_fma : __fmaf_ia32); +weak_alias (__fmaf, fmaf) + +#define __fmaf __fmaf_ia32 + +#include <sysdeps/ieee754/dbl-64/s_fmaf.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c new file mode 100644 index 0000000000..7db31b02f8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/sched_cpucount.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S new file mode 100644 index 0000000000..46ca1b3074 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_sse2 +#include "strcpy-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S new file mode 100644 index 0000000000..d971c2da38 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S new file mode 100644 index 0000000000..ee81ab6ae3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S @@ -0,0 +1,9 @@ +/* Multiple versions of stpcpy + All versions must be listed in ifunc-impl-list.c. */ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S new file mode 100644 index 0000000000..37a703cb76 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_sse2 +#include "strcpy-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S new file mode 100644 index 0000000000..14ed16f6b5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S new file mode 100644 index 0000000000..2698ca6a8c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S @@ -0,0 +1,8 @@ +/* Multiple versions of stpncpy + All versions must be listed in ifunc-impl-list.c. */ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c new file mode 100644 index 0000000000..753c6ec84a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c @@ -0,0 +1,12 @@ +#include <string.h> + +extern __typeof (strcasecmp) __strcasecmp_nonascii; + +#define __strcasecmp __strcasecmp_nonascii +#include <string/strcasecmp.c> + +strong_alias (__strcasecmp_nonascii, __strcasecmp_ia32) + +/* The needs of strcasecmp in libc are minimal, no need to go through + the IFUNC. */ +strong_alias (__strcasecmp_nonascii, __GI___strcasecmp) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S new file mode 100644 index 0000000000..ec59276408 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S @@ -0,0 +1,39 @@ +/* Entry point for multi-version x86 strcasecmp. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY(__strcasecmp) + .type __strcasecmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strcasecmp_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__strcasecmp_ssse3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + LOAD_FUNC_GOT_EAX (__strcasecmp_sse4_2) +2: ret +END(__strcasecmp) + +weak_alias (__strcasecmp, strcasecmp) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c new file mode 100644 index 0000000000..d4fcd2b4a1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c @@ -0,0 +1,13 @@ +#include <string.h> + +extern __typeof (strcasecmp_l) __strcasecmp_l_nonascii; + +#define __strcasecmp_l __strcasecmp_l_nonascii +#define USE_IN_EXTENDED_LOCALE_MODEL 1 +#include <string/strcasecmp.c> + +strong_alias (__strcasecmp_l_nonascii, __strcasecmp_l_ia32) + +/* The needs of strcasecmp in libc are minimal, no need to go through + the IFUNC. */ +strong_alias (__strcasecmp_l_nonascii, __GI___strcasecmp_l) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S new file mode 100644 index 0000000000..411d4153f2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S @@ -0,0 +1,2 @@ +#define USE_AS_STRCASECMP_L 1 +#include "strcmp-sse4.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S new file mode 100644 index 0000000000..a22b93c518 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S @@ -0,0 +1,2 @@ +#define USE_AS_STRCASECMP_L 1 +#include "strcmp-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S new file mode 100644 index 0000000000..711c09b0dc --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S @@ -0,0 +1,7 @@ +/* Multiple versions of strcasecmp_l + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP __strcasecmp_l +#define USE_AS_STRCASECMP_L +#include "strcmp.S" + +weak_alias (__strcasecmp_l, strcasecmp_l) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S new file mode 100644 index 0000000000..6359c7330c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S @@ -0,0 +1,1245 @@ +/* strcat with SSE2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#if IS_IN (libc) + +# include <sysdep.h> + + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifdef SHARED +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into ECX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into ECX. */ \ + SETUP_PIC_REG(cx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ecx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ecx,INDEX,SCALE), %ecx; \ + /* We loaded the jump table and adjusted ECX. Go. */ \ + jmp *%ecx +# else +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + +# ifndef STRCAT +# define STRCAT __strcat_sse2 +# endif + +# define PARMS 4 +# define STR1 PARMS+4 +# define STR2 STR1+4 + +# ifdef USE_AS_STRNCAT +# define LEN STR2+8 +# define STR3 STR1+4 +# else +# define STR3 STR1 +# endif + +# define USE_AS_STRCAT +# ifdef USE_AS_STRNCAT +# define RETURN POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi); +# else +# define RETURN POP(%esi); ret; CFI_PUSH(%esi); +# endif + +.text +ENTRY (STRCAT) + PUSH (%esi) + mov STR1(%esp), %eax + mov STR2(%esp), %esi +# ifdef USE_AS_STRNCAT + PUSH (%ebx) + movl LEN(%esp), %ebx + test %ebx, %ebx + jz L(ExitZero) +# endif + cmpb $0, (%esi) + mov %esi, %ecx + mov %eax, %edx + jz L(ExitZero) + + and $63, %ecx + and $63, %edx + cmp $32, %ecx + ja L(StrlenCore7_1) + cmp $48, %edx + ja L(alignment_prolog) + + pxor %xmm0, %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm7, %xmm7 + movdqu (%eax), %xmm1 + movdqu (%esi), %xmm5 + pcmpeqb %xmm1, %xmm0 + movdqu 16(%esi), %xmm6 + pmovmskb %xmm0, %ecx + pcmpeqb %xmm5, %xmm4 + pcmpeqb %xmm6, %xmm7 + test %ecx, %ecx + jnz L(exit_less16_) + mov %eax, %ecx + and $-16, %eax + jmp L(loop_prolog) + +L(alignment_prolog): + pxor %xmm0, %xmm0 + pxor %xmm4, %xmm4 + mov %edx, %ecx + pxor %xmm7, %xmm7 + and $15, %ecx + and $-16, %eax + pcmpeqb (%eax), %xmm0 + movdqu (%esi), %xmm5 + movdqu 16(%esi), %xmm6 + pmovmskb %xmm0, %edx + pcmpeqb %xmm5, %xmm4 + shr %cl, %edx + pcmpeqb %xmm6, %xmm7 + test %edx, %edx + jnz L(exit_less16) + add %eax, %ecx + + pxor %xmm0, %xmm0 +L(loop_prolog): + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + .p2align 4 +L(align16_loop): + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%eax), %eax + test %edx, %edx + jz L(align16_loop) + bsf %edx, %edx + add %edx, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit16): + bsf %edx, %edx + lea 16(%eax, %edx), %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit32): + bsf %edx, %edx + lea 32(%eax, %edx), %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit48): + bsf %edx, %edx + lea 48(%eax, %edx), %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_less16): + bsf %edx, %edx + add %ecx, %eax + add %edx, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_less16_): + bsf %ecx, %ecx + add %ecx, %eax + + .p2align 4 +L(StartStrcpyPart): + pmovmskb %xmm4, %edx +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1) + + movdqu %xmm5, (%eax) + pmovmskb %xmm7, %edx +# ifdef USE_AS_STRNCAT + cmp $32, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes1) + + mov %esi, %ecx + and $-16, %esi + and $15, %ecx + pxor %xmm0, %xmm0 +# ifdef USE_AS_STRNCAT + add %ecx, %ebx + sbb %edx, %edx + or %edx, %ebx +# endif + sub %ecx, %eax + jmp L(Unalign16Both) + +L(StrlenCore7_1): + mov %eax, %ecx + pxor %xmm0, %xmm0 + and $15, %ecx + and $-16, %eax + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + shr %cl, %edx + test %edx, %edx + jnz L(exit_less16_1) + add %eax, %ecx + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + .p2align 4 +L(align16_loop_1): + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16_1) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32_1) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48_1) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%eax), %eax + test %edx, %edx + jz L(align16_loop_1) + bsf %edx, %edx + add %edx, %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit16_1): + bsf %edx, %edx + lea 16(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit32_1): + bsf %edx, %edx + lea 32(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit48_1): + bsf %edx, %edx + lea 48(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit_less16_1): + bsf %edx, %edx + add %ecx, %eax + add %edx, %eax + + .p2align 4 +L(StartStrcpyPart_1): + mov %esi, %ecx + and $15, %ecx + and $-16, %esi + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +# ifdef USE_AS_STRNCAT + cmp $48, %ebx + ja L(BigN) +# endif + pcmpeqb (%esi), %xmm1 +# ifdef USE_AS_STRNCAT + add %ecx, %ebx +# endif + pmovmskb %xmm1, %edx + shr %cl, %edx +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STRNCAT + cmp $32, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%eax) + sub %ecx, %eax + + .p2align 4 +L(Unalign16Both): + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $48, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +L(Unalign16BothBigN): + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%eax, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm4 + movdqu %xmm3, (%eax, %ecx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm1 + movdqu %xmm4, (%eax, %ecx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%eax, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movdqu %xmm3, (%eax, %ecx) + mov %esi, %edx + lea 16(%esi, %ecx), %esi + and $-0x40, %esi + sub %esi, %edx + sub %edx, %eax +# ifdef USE_AS_STRNCAT + lea 128(%ebx, %edx), %ebx +# endif + movaps (%esi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%esi), %xmm5 + movaps 32(%esi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%esi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx +# ifdef USE_AS_STRNCAT + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jnz L(Unaligned64Leave) + + .p2align 4 +L(Unaligned64Loop_start): + add $64, %eax + add $64, %esi + movdqu %xmm4, -64(%eax) + movaps (%esi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%eax) + movaps 16(%esi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%esi), %xmm3 + movdqu %xmm6, -32(%eax) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%eax) + movaps 48(%esi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx +# ifdef USE_AS_STRNCAT + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jz L(Unaligned64Loop_start) + +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %ecx, %ecx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %ecx, %edx + movdqu %xmm4, (%eax) + movdqu %xmm5, 16(%eax) + movdqu %xmm6, 32(%eax) + add $48, %esi + add $48, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +# ifdef USE_AS_STRNCAT + .p2align 4 +L(BigN): + pcmpeqb (%esi), %xmm1 + pmovmskb %xmm1, %edx + shr %cl, %edx + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%eax) + sub %ecx, %eax + sub $48, %ebx + add %ecx, %ebx + + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + jmp L(Unalign16BothBigN) +# endif + +/*------------end of main part-------------------------------*/ + +/* Case1 */ + .p2align 4 +L(CopyFrom1To16Bytes): + add %ecx, %eax + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTail): + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %esi + add $16, %eax +L(CopyFrom1To16BytesTail1): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + bsf %edx, %edx + add %ecx, %esi + add $16, %edx + sub %ecx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %ecx, %edx + movdqu %xmm4, (%eax) + add $16, %esi + add $16, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %edx, %edx + movdqu %xmm4, (%eax) + movdqu %xmm5, 16(%eax) + add $32, %esi + add $32, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +# ifdef USE_AS_STRNCAT + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %ecx, %eax + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + add $16, %edx + sub %ecx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +L(CopyFrom1To16BytesTailCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %ecx, %eax + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To32BytesCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTailCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %eax + add $16, %esi + sub $16, %ebx +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +# endif + +# ifdef USE_AS_STRNCAT + .p2align 4 +L(StrncatExit0): + movb %bh, (%eax) + mov STR3(%esp), %eax + RETURN +# endif + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit1): + movb %bh, 1(%eax) +# endif +L(Exit1): +# ifdef USE_AS_STRNCAT + movb (%esi), %dh +# endif + movb %dh, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit2): + movb %bh, 2(%eax) +# endif +L(Exit2): + movw (%esi), %dx + movw %dx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit3): + movb %bh, 3(%eax) +# endif +L(Exit3): + movw (%esi), %cx + movw %cx, (%eax) +# ifdef USE_AS_STRNCAT + movb 2(%esi), %dh +# endif + movb %dh, 2(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit4): + movb %bh, 4(%eax) +# endif +L(Exit4): + movl (%esi), %edx + movl %edx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit5): + movb %bh, 5(%eax) +# endif +L(Exit5): + movl (%esi), %ecx +# ifdef USE_AS_STRNCAT + movb 4(%esi), %dh +# endif + movb %dh, 4(%eax) + movl %ecx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit6): + movb %bh, 6(%eax) +# endif +L(Exit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%eax) + movw %dx, 4(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit7): + movb %bh, 7(%eax) +# endif +L(Exit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%eax) + movl %edx, 3(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit8): + movb %bh, 8(%eax) +# endif +L(Exit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit9): + movb %bh, 9(%eax) +# endif +L(Exit9): + movlpd (%esi), %xmm0 +# ifdef USE_AS_STRNCAT + movb 8(%esi), %dh +# endif + movb %dh, 8(%eax) + movlpd %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit10): + movb %bh, 10(%eax) +# endif +L(Exit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%eax) + movw %dx, 8(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit11): + movb %bh, 11(%eax) +# endif +L(Exit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%eax) + movl %edx, 7(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit12): + movb %bh, 12(%eax) +# endif +L(Exit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%eax) + movl %edx, 8(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit13): + movb %bh, 13(%eax) +# endif +L(Exit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 5(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit14): + movb %bh, 14(%eax) +# endif +L(Exit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 6(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit15): + movb %bh, 15(%eax) +# endif +L(Exit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 7(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit16): + movb %bh, 16(%eax) +# endif +L(Exit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit17): + movb %bh, 17(%eax) +# endif +L(Exit17): + movdqu (%esi), %xmm0 +# ifdef USE_AS_STRNCAT + movb 16(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movb %dh, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit18): + movb %bh, 18(%eax) +# endif +L(Exit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%eax) + movw %cx, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit19): + movb %bh, 19(%eax) +# endif +L(Exit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%eax) + movl %ecx, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit20): + movb %bh, 20(%eax) +# endif +L(Exit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%eax) + movl %ecx, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit21): + movb %bh, 21(%eax) +# endif +L(Exit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx +# ifdef USE_AS_STRNCAT + movb 20(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movl %ecx, 16(%eax) + movb %dh, 20(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit22): + movb %bh, 22(%eax) +# endif +L(Exit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%eax) + movlpd %xmm3, 14(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit23): + movb %bh, 23(%eax) +# endif +L(Exit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%eax) + movlpd %xmm3, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit24): + movb %bh, 24(%eax) +# endif +L(Exit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit25): + movb %bh, 25(%eax) +# endif +L(Exit25): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 +# ifdef USE_AS_STRNCAT + movb 24(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movb %dh, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit26): + movb %bh, 26(%eax) +# endif +L(Exit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movw %cx, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit27): + movb %bh, 27(%eax) +# endif +L(Exit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movl %ecx, 23(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit28): + movb %bh, 28(%eax) +# endif +L(Exit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movl %ecx, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit29): + movb %bh, 29(%eax) +# endif +L(Exit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 13(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit30): + movb %bh, 30(%eax) +# endif +L(Exit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 14(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit31): + movb %bh, 31(%eax) +# endif +L(Exit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit32): + movb %bh, 32(%eax) +# endif +L(Exit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 16(%eax) + mov STR3(%esp), %eax + RETURN + +# ifdef USE_AS_STRNCAT + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %edx, %edx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%ebx), %ecx + and $-16, %ecx + add $48, %ebx + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%eax) + xor %bh, %bh + movb %bh, 64(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %ecx, %ecx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $48, %ebx + jle L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm4, (%eax) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm5, 16(%eax) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm6, 32(%eax) + lea 16(%eax, %ecx), %eax + lea 16(%esi, %ecx), %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) +# endif + .p2align 4 +L(ExitZero): + RETURN + +END (STRCAT) + + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) +# ifdef USE_AS_STRNCAT +L(ExitStrncatTable): + .int JMPTBL(L(StrncatExit0), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit1), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit2), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit3), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit4), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit5), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit6), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit7), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit8), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit9), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit10), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit11), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit12), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit13), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit14), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit15), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit16), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit17), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit18), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit19), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit20), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit21), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit22), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit23), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit24), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit25), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit26), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit27), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit28), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit29), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit30), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit31), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit32), L(ExitStrncatTable)) +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S new file mode 100644 index 0000000000..59ffbc60a5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S @@ -0,0 +1,572 @@ +/* strcat with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef STRCAT +# define STRCAT __strcat_ssse3 +# endif + +# define PARMS 4 +# define STR1 PARMS+4 +# define STR2 STR1+4 + +# ifdef USE_AS_STRNCAT +# define LEN STR2+8 +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) + PUSH (%edi) + mov STR1(%esp), %edi + mov %edi, %edx + +# define RETURN jmp L(StartStrcpyPart) +# include "strlen-sse2.S" + +L(StartStrcpyPart): + mov STR2(%esp), %ecx + lea (%edi, %eax), %edx +# ifdef USE_AS_STRNCAT + PUSH (%ebx) + mov LEN(%esp), %ebx + test %ebx, %ebx + jz L(StrncatExit0) + cmp $8, %ebx + jbe L(StrncatExit8Bytes) +# endif + cmpb $0, (%ecx) + jz L(Exit1) + cmpb $0, 1(%ecx) + jz L(Exit2) + cmpb $0, 2(%ecx) + jz L(Exit3) + cmpb $0, 3(%ecx) + jz L(Exit4) + cmpb $0, 4(%ecx) + jz L(Exit5) + cmpb $0, 5(%ecx) + jz L(Exit6) + cmpb $0, 6(%ecx) + jz L(Exit7) + cmpb $0, 7(%ecx) + jz L(Exit8) + cmpb $0, 8(%ecx) + jz L(Exit9) +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jb L(StrncatExit15Bytes) +# endif + cmpb $0, 9(%ecx) + jz L(Exit10) + cmpb $0, 10(%ecx) + jz L(Exit11) + cmpb $0, 11(%ecx) + jz L(Exit12) + cmpb $0, 12(%ecx) + jz L(Exit13) + cmpb $0, 13(%ecx) + jz L(Exit14) + cmpb $0, 14(%ecx) + jz L(Exit15) + cmpb $0, 15(%ecx) + jz L(Exit16) +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + je L(StrncatExit16) + +# define RETURN1 \ + POP (%ebx); \ + POP (%edi); \ + ret; \ + CFI_PUSH (%ebx); \ + CFI_PUSH (%edi) +# define USE_AS_STRNCPY +# else +# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi) +# endif +# include "strcpy-ssse3.S" + .p2align 4 +L(CopyFrom1To16Bytes): + add %esi, %edx + add %esi, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + movlpd (%ecx), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit1): + movb %bh, 1(%edx) +L(Exit1): + movb (%ecx), %al + movb %al, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit2): + movb %bh, 2(%edx) +L(Exit2): + movw (%ecx), %ax + movw %ax, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit3): + movb %bh, 3(%edx) +L(Exit3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit4): + movb %bh, 4(%edx) +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit5): + movb %bh, 5(%edx) +L(Exit5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit6): + movb %bh, 6(%edx) +L(Exit6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit7): + movb %bh, 7(%edx) +L(Exit7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit8): + movb %bh, 8(%edx) +L(Exit8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit9): + movb %bh, 9(%edx) +L(Exit9): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit10): + movb %bh, 10(%edx) +L(Exit10): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit11): + movb %bh, 11(%edx) +L(Exit11): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit12): + movb %bh, 12(%edx) +L(Exit12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit13): + movb %bh, 13(%edx) +L(Exit13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit14): + movb %bh, 14(%edx) +L(Exit14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit15): + movb %bh, 15(%edx) +L(Exit15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit16): + movb %bh, 16(%edx) +L(Exit16): + movlpd (%ecx), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) + movl %edi, %eax + RETURN1 + +# ifdef USE_AS_STRNCPY + + CFI_PUSH(%esi) + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %esi, %ecx + lea (%esi, %edx), %esi + lea -9(%ebx), %edx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%esi), %edx + POP (%esi) + jz L(ExitHighCase2) + + test $0x01, %al + jnz L(Exit1) + cmp $1, %ebx + je L(StrncatExit1) + test $0x02, %al + jnz L(Exit2) + cmp $2, %ebx + je L(StrncatExit2) + test $0x04, %al + jnz L(Exit3) + cmp $3, %ebx + je L(StrncatExit3) + test $0x08, %al + jnz L(Exit4) + cmp $4, %ebx + je L(StrncatExit4) + test $0x10, %al + jnz L(Exit5) + cmp $5, %ebx + je L(StrncatExit5) + test $0x20, %al + jnz L(Exit6) + cmp $6, %ebx + je L(StrncatExit6) + test $0x40, %al + jnz L(Exit7) + cmp $7, %ebx + je L(StrncatExit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + lea 7(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax + xor %cl, %cl + movb %cl, (%eax) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(ExitHighCase2): + test $0x01, %ah + jnz L(Exit9) + cmp $9, %ebx + je L(StrncatExit9) + test $0x02, %ah + jnz L(Exit10) + cmp $10, %ebx + je L(StrncatExit10) + test $0x04, %ah + jnz L(Exit11) + cmp $11, %ebx + je L(StrncatExit11) + test $0x8, %ah + jnz L(Exit12) + cmp $12, %ebx + je L(StrncatExit12) + test $0x10, %ah + jnz L(Exit13) + cmp $13, %ebx + je L(StrncatExit13) + test $0x20, %ah + jnz L(Exit14) + cmp $14, %ebx + je L(StrncatExit14) + test $0x40, %ah + jnz L(Exit15) + cmp $15, %ebx + je L(StrncatExit15) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm1 + movlpd %xmm1, 8(%edx) + movl %edi, %eax + RETURN1 + + CFI_PUSH(%esi) + +L(CopyFrom1To16BytesCase2OrCase3): + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %esi, %edx + add %esi, %ecx + + POP (%esi) + + cmp $8, %ebx + ja L(ExitHighCase3) + cmp $1, %ebx + je L(StrncatExit1) + cmp $2, %ebx + je L(StrncatExit2) + cmp $3, %ebx + je L(StrncatExit3) + cmp $4, %ebx + je L(StrncatExit4) + cmp $5, %ebx + je L(StrncatExit5) + cmp $6, %ebx + je L(StrncatExit6) + cmp $7, %ebx + je L(StrncatExit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movb %bh, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(ExitHighCase3): + cmp $9, %ebx + je L(StrncatExit9) + cmp $10, %ebx + je L(StrncatExit10) + cmp $11, %ebx + je L(StrncatExit11) + cmp $12, %ebx + je L(StrncatExit12) + cmp $13, %ebx + je L(StrncatExit13) + cmp $14, %ebx + je L(StrncatExit14) + cmp $15, %ebx + je L(StrncatExit15) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm1 + movlpd %xmm1, 8(%edx) + movb %bh, 16(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit0): + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit15Bytes): + cmp $9, %ebx + je L(StrncatExit9) + cmpb $0, 9(%ecx) + jz L(Exit10) + cmp $10, %ebx + je L(StrncatExit10) + cmpb $0, 10(%ecx) + jz L(Exit11) + cmp $11, %ebx + je L(StrncatExit11) + cmpb $0, 11(%ecx) + jz L(Exit12) + cmp $12, %ebx + je L(StrncatExit12) + cmpb $0, 12(%ecx) + jz L(Exit13) + cmp $13, %ebx + je L(StrncatExit13) + cmpb $0, 13(%ecx) + jz L(Exit14) + cmp $14, %ebx + je L(StrncatExit14) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) + lea 14(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax + movb %bh, (%eax) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit8Bytes): + cmpb $0, (%ecx) + jz L(Exit1) + cmp $1, %ebx + je L(StrncatExit1) + cmpb $0, 1(%ecx) + jz L(Exit2) + cmp $2, %ebx + je L(StrncatExit2) + cmpb $0, 2(%ecx) + jz L(Exit3) + cmp $3, %ebx + je L(StrncatExit3) + cmpb $0, 3(%ecx) + jz L(Exit4) + cmp $4, %ebx + je L(StrncatExit4) + cmpb $0, 4(%ecx) + jz L(Exit5) + cmp $5, %ebx + je L(StrncatExit5) + cmpb $0, 5(%ecx) + jz L(Exit6) + cmp $6, %ebx + je L(StrncatExit6) + cmpb $0, 6(%ecx) + jz L(Exit7) + cmp $7, %ebx + je L(StrncatExit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + lea 7(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax + movb %bh, (%eax) + movl %edi, %eax + RETURN1 + +# endif +END (STRCAT) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S new file mode 100644 index 0000000000..8412cb6f23 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S @@ -0,0 +1,92 @@ +/* Multiple versions of strcat + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef USE_AS_STRNCAT +# ifndef STRCAT +# define STRCAT strcat +# endif +#endif + +#ifdef USE_AS_STRNCAT +# define STRCAT_SSSE3 __strncat_ssse3 +# define STRCAT_SSE2 __strncat_sse2 +# define STRCAT_IA32 __strncat_ia32 +# define __GI_STRCAT __GI_strncat +#else +# define STRCAT_SSSE3 __strcat_ssse3 +# define STRCAT_SSE2 __strcat_sse2 +# define STRCAT_IA32 __strcat_ia32 +# define __GI_STRCAT __GI_strcat +#endif + + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncat in static library since we + need strncat before the initialization happened. */ +#if IS_IN (libc) + + .text +ENTRY(STRCAT) + .type STRCAT, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (STRCAT_IA32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (STRCAT_SSE2) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (STRCAT_SSSE3) +2: ret +END(STRCAT) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCAT_IA32, @function; \ + .align 16; \ + .globl STRCAT_IA32; \ + .hidden STRCAT_IA32; \ + STRCAT_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcat calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32 + +# endif +#endif + +#ifndef USE_AS_STRNCAT +# include "../../strcat.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S new file mode 100644 index 0000000000..95fd7c084e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S @@ -0,0 +1,158 @@ +/* strchr with SSE2 with bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 +# define ENTRANCE PUSH(%edi) +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); + +# define STR1 PARMS +# define STR2 STR1+4 + + .text +ENTRY (__strchr_sse2_bsf) + + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + /* ECX has OFFSET. */ + and $15, %ecx + pshufd $0, %xmm1, %xmm1 + je L(loop) + +/* Handle unaligned string. */ + and $-16, %edi + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm2, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + sarl %cl, %edx + sarl %cl, %eax + test %eax, %eax + je L(unaligned_no_match) + /* Check which byte is a match. */ + bsf %eax, %eax + /* Is there a NULL? */ + test %edx, %edx + je L(unaligned_match) + bsf %edx, %edx + cmpl %edx, %eax + /* Return NULL if NULL comes first. */ + ja L(return_null) +L(unaligned_match): + add %edi, %eax + add %ecx, %eax + RETURN + + .p2align 4 +L(unaligned_no_match): + test %edx, %edx + jne L(return_null) + pxor %xmm2, %xmm2 + + add $16, %edi + + .p2align 4 +/* Loop start on aligned string. */ +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + jmp L(loop) + +L(matches): + pmovmskb %xmm2, %edx + test %eax, %eax + jz L(return_null) + bsf %eax, %eax + /* There is a match. First find where NULL is. */ + test %edx, %edx + je L(match) + bsf %edx, %ecx + /* Check if NULL comes first. */ + cmpl %ecx, %eax + ja L(return_null) +L(match): + sub $16, %edi + add %edi, %eax + RETURN + +/* Return NULL. */ + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + +END (__strchr_sse2_bsf) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S new file mode 100644 index 0000000000..1f9e875b04 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S @@ -0,0 +1,348 @@ +/* strchr SSE2 without bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 +# define ENTRANCE PUSH(%edi) +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); + +# define STR1 PARMS +# define STR2 STR1+4 + + atom_text_section +ENTRY (__strchr_sse2) + + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + /* ECX has OFFSET. */ + and $15, %ecx + pshufd $0, %xmm1, %xmm1 + je L(loop) + +/* Handle unaligned string. */ + and $-16, %edi + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm2, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + sarl %cl, %edx + sarl %cl, %eax + test %eax, %eax + jz L(unaligned_no_match) + /* Check which byte is a match. */ + /* Is there a NULL? */ + add %ecx, %edi + test %edx, %edx + jz L(match_case1) + jmp L(match_case2) + + .p2align 4 +L(unaligned_no_match): + test %edx, %edx + jne L(return_null) + + pxor %xmm2, %xmm2 + add $16, %edi + + .p2align 4 +/* Loop start on aligned string. */ +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + jmp L(loop) + +L(matches): + /* There is a match. First find where NULL is. */ + test %edx, %edx + jz L(match_case1) + + .p2align 4 +L(match_case2): + test %al, %al + jz L(match_higth_case2) + + mov %al, %cl + and $15, %cl + jnz L(match_case2_4) + + mov %dl, %ch + and $15, %ch + jnz L(return_null) + + test $0x10, %al + jnz L(Exit5) + test $0x10, %dl + jnz L(return_null) + test $0x20, %al + jnz L(Exit6) + test $0x20, %dl + jnz L(return_null) + test $0x40, %al + jnz L(Exit7) + test $0x40, %dl + jnz L(return_null) + lea 7(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_4): + test $0x01, %al + jnz L(Exit1) + test $0x01, %dl + jnz L(return_null) + test $0x02, %al + jnz L(Exit2) + test $0x02, %dl + jnz L(return_null) + test $0x04, %al + jnz L(Exit3) + test $0x04, %dl + jnz L(return_null) + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(match_higth_case2): + test %dl, %dl + jnz L(return_null) + + mov %ah, %cl + and $15, %cl + jnz L(match_case2_12) + + mov %dh, %ch + and $15, %ch + jnz L(return_null) + + test $0x10, %ah + jnz L(Exit13) + test $0x10, %dh + jnz L(return_null) + test $0x20, %ah + jnz L(Exit14) + test $0x20, %dh + jnz L(return_null) + test $0x40, %ah + jnz L(Exit15) + test $0x40, %dh + jnz L(return_null) + lea 15(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_12): + test $0x01, %ah + jnz L(Exit9) + test $0x01, %dh + jnz L(return_null) + test $0x02, %ah + jnz L(Exit10) + test $0x02, %dh + jnz L(return_null) + test $0x04, %ah + jnz L(Exit11) + test $0x04, %dh + jnz L(return_null) + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(match_case1): + test %al, %al + jz L(match_higth_case1) + + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + lea 7(%edi), %eax + RETURN + + .p2align 4 +L(match_higth_case1): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + lea 15(%edi), %eax + RETURN + + .p2align 4 +L(Exit1): + lea (%edi), %eax + RETURN + + .p2align 4 +L(Exit2): + lea 1(%edi), %eax + RETURN + + .p2align 4 +L(Exit3): + lea 2(%edi), %eax + RETURN + + .p2align 4 +L(Exit4): + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(Exit5): + lea 4(%edi), %eax + RETURN + + .p2align 4 +L(Exit6): + lea 5(%edi), %eax + RETURN + + .p2align 4 +L(Exit7): + lea 6(%edi), %eax + RETURN + + .p2align 4 +L(Exit9): + lea 8(%edi), %eax + RETURN + + .p2align 4 +L(Exit10): + lea 9(%edi), %eax + RETURN + + .p2align 4 +L(Exit11): + lea 10(%edi), %eax + RETURN + + .p2align 4 +L(Exit12): + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(Exit13): + lea 12(%edi), %eax + RETURN + + .p2align 4 +L(Exit14): + lea 13(%edi), %eax + RETURN + + .p2align 4 +L(Exit15): + lea 14(%edi), %eax + RETURN + +/* Return NULL. */ + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + +END (__strchr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S new file mode 100644 index 0000000000..5b97b1c767 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S @@ -0,0 +1,57 @@ +/* Multiple versions of strchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(strchr) + .type strchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strchr_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__strchr_sse2_bsf) + HAS_ARCH_FEATURE (Slow_BSF) + jz 2f + LOAD_FUNC_GOT_EAX (__strchr_sse2) +2: ret +END(strchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __strchr_ia32, @function; \ + .globl __strchr_ia32; \ + .p2align 4; \ + __strchr_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strchr_ia32, .-__strchr_ia32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strchr; __GI_strchr = __strchr_ia32 +#endif + +#include "../../i586/strchr.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S new file mode 100644 index 0000000000..cd26058671 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S @@ -0,0 +1,804 @@ +/* strcmp with SSE4.2 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_STRNCMP +# ifndef STRCMP +# define STRCMP __strncmp_sse4_2 +# endif +# define STR1 8 +# define STR2 STR1+4 +# define CNT STR2+4 +# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM) +# define REM %ebp +#elif defined USE_AS_STRCASECMP_L +# include "locale-defines.h" +# ifndef STRCMP +# define STRCMP __strcasecmp_l_sse4_2 +# endif +# ifdef PIC +# define STR1 12 +# else +# define STR1 8 +# endif +# define STR2 STR1+4 +# define LOCALE 12 /* Loaded before the adjustment. */ +# ifdef PIC +# define RETURN POP (%edi); POP (%ebx); ret; \ + .p2align 4; CFI_PUSH (%ebx); CFI_PUSH (%edi) +# else +# define RETURN POP (%edi); ret; .p2align 4; CFI_PUSH (%edi) +# endif +# define NONASCII __strcasecmp_nonascii +#elif defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" +# ifndef STRCMP +# define STRCMP __strncasecmp_l_sse4_2 +# endif +# ifdef PIC +# define STR1 16 +# else +# define STR1 12 +# endif +# define STR2 STR1+4 +# define CNT STR2+4 +# define LOCALE 16 /* Loaded before the adjustment. */ +# ifdef PIC +# define RETURN POP (%edi); POP (REM); POP (%ebx); ret; \ + .p2align 4; \ + CFI_PUSH (%ebx); CFI_PUSH (REM); CFI_PUSH (%edi) +# else +# define RETURN POP (%edi); POP (REM); ret; \ + .p2align 4; CFI_PUSH (REM); CFI_PUSH (%edi) +# endif +# define REM %ebp +# define NONASCII __strncasecmp_nonascii +#else +# ifndef STRCMP +# define STRCMP __strcmp_sse4_2 +# endif +# define STR1 4 +# define STR2 STR1+4 +# define RETURN ret; .p2align 4 +#endif + + .section .text.sse4.2,"ax",@progbits + +#ifdef USE_AS_STRCASECMP_L +ENTRY (__strcasecmp_sse4_2) +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) + movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax +# ifdef NO_TLS_DIRECT_SEG_REFS + addl %gs:0, %eax + movl (%eax), %eax +# else + movl %gs:(%eax), %eax +# endif +# else +# ifdef NO_TLS_DIRECT_SEG_REFS + movl %gs:0, %eax + movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax +# else + movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax +# endif +# endif +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) +# ifdef PIC + je L(ascii) + POP (%ebx) + jmp __strcasecmp_nonascii +# else + jne __strcasecmp_nonascii + jmp L(ascii) +# endif +END (__strcasecmp_sse4_2) +#endif + +#ifdef USE_AS_STRNCASECMP_L +ENTRY (__strncasecmp_sse4_2) +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) + movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax +# ifdef NO_TLS_DIRECT_SEG_REFS + addl %gs:0, %eax + movl (%eax), %eax +# else + movl %gs:(%eax), %eax +# endif +# else +# ifdef NO_TLS_DIRECT_SEG_REFS + movl %gs:0, %eax + movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax +# else + movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax +# endif +# endif +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) +# ifdef PIC + je L(ascii) + POP (%ebx) + jmp __strncasecmp_nonascii +# else + jne __strncasecmp_nonascii + jmp L(ascii) +# endif +END (__strncasecmp_sse4_2) +#endif + + ENTRY (STRCMP) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movl LOCALE(%esp), %eax +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) + jne NONASCII + +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) +# endif +L(ascii): + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +.Lbelowupper: + .quad 0x4040404040404040 + .quad 0x4040404040404040 +.Ltopupper: + .quad 0x5b5b5b5b5b5b5b5b + .quad 0x5b5b5b5b5b5b5b5b +.Ltouppermask: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + +# ifdef PIC +# define UCLOW_reg .Lbelowupper@GOTOFF(%ebx) +# define UCHIGH_reg .Ltopupper@GOTOFF(%ebx) +# define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx) +# else +# define UCLOW_reg .Lbelowupper +# define UCHIGH_reg .Ltopupper +# define LCQWORD_reg .Ltouppermask +# endif +#endif + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + PUSH (REM) +#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + PUSH (%edi) +#endif + mov STR1(%esp), %edx + mov STR2(%esp), %eax +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + movl CNT(%esp), REM + test REM, REM + je L(eq) +#endif + mov %dx, %cx + and $0xfff, %cx + cmp $0xff0, %cx + ja L(first4bytes) + movdqu (%edx), %xmm2 + mov %eax, %ecx + and $0xfff, %ecx + cmp $0xff0, %ecx + ja L(first4bytes) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define TOLOWER(reg1, reg2) \ + movdqa reg1, %xmm3; \ + movdqa UCHIGH_reg, %xmm4; \ + movdqa reg2, %xmm5; \ + movdqa UCHIGH_reg, %xmm6; \ + pcmpgtb UCLOW_reg, %xmm3; \ + pcmpgtb reg1, %xmm4; \ + pcmpgtb UCLOW_reg, %xmm5; \ + pcmpgtb reg2, %xmm6; \ + pand %xmm4, %xmm3; \ + pand %xmm6, %xmm5; \ + pand LCQWORD_reg, %xmm3; \ + pand LCQWORD_reg, %xmm5; \ + por %xmm3, reg1; \ + por %xmm5, reg2 + + movdqu (%eax), %xmm1 + TOLOWER (%xmm2, %xmm1) + movd %xmm2, %ecx + movd %xmm1, %edi + movdqa %xmm2, %xmm3 + movdqa %xmm1, %xmm4 + cmpl %edi, %ecx +#else +# define TOLOWER(reg1, reg) + + movd %xmm2, %ecx + cmp (%eax), %ecx +#endif + jne L(less4bytes) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + movdqu (%eax), %xmm1 +#endif + pxor %xmm2, %xmm1 + pxor %xmm0, %xmm0 + ptest %xmm1, %xmm0 + jnc L(less16bytes) + pcmpeqb %xmm0, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, REM + jbe L(eq) +#endif + add $16, %edx + add $16, %eax +L(first4bytes): + movzbl (%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl (%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, (%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $1, REM + je L(eq) +#endif + + movzbl 1(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 1(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 1(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $2, REM + je L(eq) +#endif + movzbl 2(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 2(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 2(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, REM + je L(eq) +#endif + movzbl 3(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 3(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 3(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + je L(eq) +#endif + movzbl 4(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 4(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 4(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, REM + je L(eq) +#endif + movzbl 5(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 5(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 5(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, REM + je L(eq) +#endif + movzbl 6(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 6(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 6(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + je L(eq) +#endif + movzbl 7(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 7(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 7(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $8, REM + je L(eq) +#endif + add $8, %eax + add $8, %edx + +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + PUSH (%edi) +#endif + PUSH (%esi) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cfi_remember_state +#endif + mov %edx, %edi + mov %eax, %esi + xorl %eax, %eax +L(check_offset): + movl %edi, %edx + movl %esi, %ecx + andl $0xfff, %edx + andl $0xfff, %ecx + cmpl %edx, %ecx + cmovl %edx, %ecx + lea -0xff0(%ecx), %edx + sub %edx, %edi + sub %edx, %esi + testl %edx, %edx + jg L(crosspage) +L(loop): + movdqu (%esi,%edx), %xmm2 + movdqu (%edi,%edx), %xmm1 + TOLOWER (%xmm2, %xmm1) + pcmpistri $0x1a, %xmm2, %xmm1 + jbe L(end) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, REM + jbe L(more16byteseq) +#endif + + add $16, %edx + jle L(loop) +L(crosspage): + movzbl (%edi,%edx), %eax + movzbl (%esi,%edx), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx +# endif +#endif + subl %ecx, %eax + jne L(ret) + testl %ecx, %ecx + je L(ret) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $1, REM + jbe L(more16byteseq) +#endif + inc %edx + cmp $15, %edx + jle L(crosspage) + add %edx, %edi + add %edx, %esi + jmp L(check_offset) + + .p2align 4 +L(end): + jnc L(ret) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %ecx, REM + jbe L(more16byteseq) +#endif + lea (%ecx,%edx), %ecx + movzbl (%edi,%ecx), %eax + movzbl (%esi,%ecx), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx +# endif +#endif + subl %ecx, %eax +L(ret): + POP (%esi) + POP (%edi) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + POP (REM) +#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + POP (%ebx) +# endif +#endif + ret + + .p2align 4 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cfi_restore_state +L(more16byteseq): + POP (%esi) +# ifdef USE_AS_STRNCMP + POP (%edi) +# endif +#endif +L(eq): + xorl %eax, %eax + RETURN + +L(neq): + mov $1, %eax + ja L(neq_bigger) + neg %eax +L(neq_bigger): + RETURN + +L(less16bytes): + add $0xfefefeff, %ecx + jnc L(less4bytes) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movd %xmm3, %edi + xor %edi, %ecx +#else + xor (%edx), %ecx +#endif + or $0xfefefeff, %ecx + add $1, %ecx + jnz L(less4bytes) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + jbe L(eq) +#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + psrldq $4, %xmm3 + psrldq $4, %xmm4 + movd %xmm3, %ecx + movd %xmm4, %edi + cmp %edi, %ecx + mov %ecx, %edi +#else + mov 4(%edx), %ecx + cmp 4(%eax), %ecx +#endif + jne L(more4bytes) + add $0xfefefeff, %ecx + jnc L(more4bytes) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + xor %edi, %ecx +#else + xor 4(%edx), %ecx +#endif + or $0xfefefeff, %ecx + add $1, %ecx + jnz L(more4bytes) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $8, REM + jbe L(eq) +#endif + + add $8, %edx + add $8, %eax +L(less4bytes): + + movzbl (%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl (%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, (%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $1, REM + je L(eq) +#endif + movzbl 1(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 1(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 1(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $2, REM + je L(eq) +#endif + + movzbl 2(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 2(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 2(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, REM + je L(eq) +#endif + movzbl 3(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 3(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 3(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +L(more4bytes): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + je L(eq) +#endif + movzbl 4(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 4(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 4(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, REM + je L(eq) +#endif + movzbl 5(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 5(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 5(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, REM + je L(eq) +#endif + movzbl 6(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 6(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 6(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + je L(eq) +#endif + movzbl 7(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 7(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 7(%edx) +#endif + jne L(neq) + jmp L(eq) + +END (STRCMP) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S new file mode 100644 index 0000000000..b25cc3e068 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S @@ -0,0 +1,2810 @@ +/* strcmp with SSSE3 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_STRNCMP +# ifndef STRCMP +# define STRCMP __strncmp_ssse3 +# endif +# define STR1 8 +# define STR2 STR1+4 +# define CNT STR2+4 +# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM) +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + mov $16, %esi; \ + sub %ecx, %esi; \ + cmp %esi, REM; \ + jbe L(more8byteseq); \ + sub %esi, REM +# define FLAGS %ebx +# define REM %ebp +#elif defined USE_AS_STRCASECMP_L +# include "locale-defines.h" +# ifndef STRCMP +# define STRCMP __strcasecmp_l_ssse3 +# endif +# ifdef PIC +# define STR1 8 +# else +# define STR1 4 +# endif +# define STR2 STR1+4 +# define LOCALE 12 /* Loaded before the adjustment. */ +# ifdef PIC +# define RETURN POP (%ebx); ret; .p2align 4; CFI_PUSH (%ebx) +# else +# define RETURN ret; .p2align 4 +# endif +# define UPDATE_STRNCMP_COUNTER +# define FLAGS (%esp) +# define NONASCII __strcasecmp_nonascii +#elif defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" +# ifndef STRCMP +# define STRCMP __strncasecmp_l_ssse3 +# endif +# ifdef PIC +# define STR1 12 +# else +# define STR1 8 +# endif +# define STR2 STR1+4 +# define CNT STR2+4 +# define LOCALE 16 /* Loaded before the adjustment. */ +# ifdef PIC +# define RETURN POP (REM); POP (%ebx); ret; \ + .p2align 4; CFI_PUSH (%ebx); CFI_PUSH (REM) +# else +# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM) +# endif +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + mov $16, %esi; \ + sub %ecx, %esi; \ + cmp %esi, REM; \ + jbe L(more8byteseq); \ + sub %esi, REM +# define FLAGS (%esp) +# define REM %ebp +# define NONASCII __strncasecmp_nonascii +#else +# ifndef STRCMP +# define STRCMP __strcmp_ssse3 +# endif +# define STR1 4 +# define STR2 STR1+4 +# define RETURN ret; .p2align 4 +# define UPDATE_STRNCMP_COUNTER +# define FLAGS %ebx +#endif + + .section .text.ssse3,"ax",@progbits + +#ifdef USE_AS_STRCASECMP_L +ENTRY (__strcasecmp_ssse3) +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) + movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax +# ifdef NO_TLS_DIRECT_SEG_REFS + addl %gs:0, %eax + movl (%eax), %eax +# else + movl %gs:(%eax), %eax +# endif +# else +# ifdef NO_TLS_DIRECT_SEG_REFS + movl %gs:0, %eax + movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax +# else + movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax +# endif +# endif +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) +# ifdef PIC + je L(ascii) + POP (%ebx) + jmp __strcasecmp_nonascii +# else + jne __strcasecmp_nonascii + jmp L(ascii) +# endif +END (__strcasecmp_ssse3) +#endif + +#ifdef USE_AS_STRNCASECMP_L +ENTRY (__strncasecmp_ssse3) +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) + movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax +# ifdef NO_TLS_DIRECT_SEG_REFS + addl %gs:0, %eax + movl (%eax), %eax +# else + movl %gs:(%eax), %eax +# endif +# else +# ifdef NO_TLS_DIRECT_SEG_REFS + movl %gs:0, %eax + movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax +# else + movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax +# endif +# endif +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) +# ifdef PIC + je L(ascii) + POP (%ebx) + jmp __strncasecmp_nonascii +# else + jne __strncasecmp_nonascii + jmp L(ascii) +# endif +END (__strncasecmp_ssse3) +#endif + +ENTRY (STRCMP) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movl LOCALE(%esp), %eax +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) + jne NONASCII + +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) +# endif +L(ascii): + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +.Lbelowupper: + .quad 0x4040404040404040 + .quad 0x4040404040404040 +.Ltopupper: + .quad 0x5b5b5b5b5b5b5b5b + .quad 0x5b5b5b5b5b5b5b5b +.Ltouppermask: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + +# ifdef PIC +# define UCLOW_reg .Lbelowupper@GOTOFF(%ebx) +# define UCHIGH_reg .Ltopupper@GOTOFF(%ebx) +# define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx) +# else +# define UCLOW_reg .Lbelowupper +# define UCHIGH_reg .Ltopupper +# define LCQWORD_reg .Ltouppermask +# endif +#endif + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + PUSH (REM) +#endif + + movl STR1(%esp), %edx + movl STR2(%esp), %eax +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + movl CNT(%esp), REM + cmp $16, REM + jb L(less16bytes_sncmp) +#elif !defined USE_AS_STRCASECMP_L + movzbl (%eax), %ecx + cmpb %cl, (%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 1(%eax), %ecx + cmpb %cl, 1(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 2(%eax), %ecx + cmpb %cl, 2(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 3(%eax), %ecx + cmpb %cl, 3(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 4(%eax), %ecx + cmpb %cl, 4(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 5(%eax), %ecx + cmpb %cl, 5(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 6(%eax), %ecx + cmpb %cl, 6(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 7(%eax), %ecx + cmpb %cl, 7(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + add $8, %edx + add $8, %eax +#endif + movl %edx, %ecx + and $0xfff, %ecx + cmp $0xff0, %ecx + ja L(crosspage) + mov %eax, %ecx + and $0xfff, %ecx + cmp $0xff0, %ecx + ja L(crosspage) + pxor %xmm0, %xmm0 + movlpd (%eax), %xmm1 + movlpd (%edx), %xmm2 + movhpd 8(%eax), %xmm1 + movhpd 8(%edx), %xmm2 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define TOLOWER(reg1, reg2) \ + movdqa reg1, %xmm5; \ + movdqa reg2, %xmm7; \ + movdqa UCHIGH_reg, %xmm6; \ + pcmpgtb UCLOW_reg, %xmm5; \ + pcmpgtb UCLOW_reg, %xmm7; \ + pcmpgtb reg1, %xmm6; \ + pand %xmm6, %xmm5; \ + movdqa UCHIGH_reg, %xmm6; \ + pcmpgtb reg2, %xmm6; \ + pand %xmm6, %xmm7; \ + pand LCQWORD_reg, %xmm5; \ + por %xmm5, reg1; \ + pand LCQWORD_reg, %xmm7; \ + por %xmm7, reg2 + TOLOWER (%xmm1, %xmm2) +#else +# define TOLOWER(reg1, reg2) +#endif + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %ecx + sub $0xffff, %ecx + jnz L(less16bytes) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(eq) +#endif + add $16, %eax + add $16, %edx + +L(crosspage): + +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + PUSH (FLAGS) +#endif + PUSH (%edi) + PUSH (%esi) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + pushl $0 + cfi_adjust_cfa_offset (4) +#endif +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cfi_remember_state +#endif + + movl %edx, %edi + movl %eax, %ecx + and $0xf, %ecx + and $0xf, %edi + xor %ecx, %eax + xor %edi, %edx +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + xor FLAGS, FLAGS +#endif + cmp %edi, %ecx + je L(ashr_0) + ja L(bigger) + orl $0x20, FLAGS + xchg %edx, %eax + xchg %ecx, %edi +L(bigger): + lea 15(%edi), %edi + sub %ecx, %edi + cmp $8, %edi + jle L(ashr_less_8) + cmp $14, %edi + je L(ashr_15) + cmp $13, %edi + je L(ashr_14) + cmp $12, %edi + je L(ashr_13) + cmp $11, %edi + je L(ashr_12) + cmp $10, %edi + je L(ashr_11) + cmp $9, %edi + je L(ashr_10) +L(ashr_less_8): + je L(ashr_9) + cmp $7, %edi + je L(ashr_8) + cmp $6, %edi + je L(ashr_7) + cmp $5, %edi + je L(ashr_6) + cmp $4, %edi + je L(ashr_5) + cmp $3, %edi + je L(ashr_4) + cmp $2, %edi + je L(ashr_3) + cmp $1, %edi + je L(ashr_2) + cmp $0, %edi + je L(ashr_1) + +/* + * The following cases will be handled by ashr_0 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +L(ashr_0): + mov $0xffff, %esi + movdqa (%eax), %xmm1 + pxor %xmm0, %xmm0 + pcmpeqb %xmm1, %xmm0 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movdqa (%edx), %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm2, %xmm1 +#else + pcmpeqb (%edx), %xmm1 +#endif + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + mov %ecx, %edi + jne L(less32bytes) + UPDATE_STRNCMP_COUNTER + movl $0x10, FLAGS + mov $0x10, %ecx + pxor %xmm0, %xmm0 + .p2align 4 +L(loop_ashr_0): + movdqa (%eax, %ecx), %xmm1 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movdqa (%edx, %ecx), %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 +#else + pcmpeqb %xmm1, %xmm0 + pcmpeqb (%edx, %ecx), %xmm1 +#endif + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + jmp L(loop_ashr_0) + +/* + * The following cases will be handled by ashr_1 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +L(ashr_1): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $15, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -15(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $1, FLAGS + lea 1(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_1): + add $16, %edi + jg L(nibble_ashr_1) + +L(gobble_ashr_1): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $1, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_1) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $1, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_1) + + .p2align 4 +L(nibble_ashr_1): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfffe, %esi + jnz L(ashr_1_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $15, REM + jbe L(ashr_1_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_1) + + .p2align 4 +L(ashr_1_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $1, %xmm0 + psrldq $1, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_2 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +L(ashr_2): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -14(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $2, FLAGS + lea 2(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_2): + add $16, %edi + jg L(nibble_ashr_2) + +L(gobble_ashr_2): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $2, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_2) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $2, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_2) + + .p2align 4 +L(nibble_ashr_2): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfffc, %esi + jnz L(ashr_2_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $14, REM + jbe L(ashr_2_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_2) + + .p2align 4 +L(ashr_2_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $2, %xmm0 + psrldq $2, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_3 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +L(ashr_3): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -13(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $3, FLAGS + lea 3(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_3): + add $16, %edi + jg L(nibble_ashr_3) + +L(gobble_ashr_3): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $3, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_3) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $3, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_3) + + .p2align 4 +L(nibble_ashr_3): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfff8, %esi + jnz L(ashr_3_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $13, REM + jbe L(ashr_3_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_3) + + .p2align 4 +L(ashr_3_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $3, %xmm0 + psrldq $3, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_4 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +L(ashr_4): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -12(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $4, FLAGS + lea 4(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_4): + add $16, %edi + jg L(nibble_ashr_4) + +L(gobble_ashr_4): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $4, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_4) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $4, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_4) + + .p2align 4 +L(nibble_ashr_4): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfff0, %esi + jnz L(ashr_4_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $12, REM + jbe L(ashr_4_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_4) + + .p2align 4 +L(ashr_4_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_5 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(11~15) n -11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +L(ashr_5): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -11(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $5, FLAGS + lea 5(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_5): + add $16, %edi + jg L(nibble_ashr_5) + +L(gobble_ashr_5): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $5, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_5) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $5, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_5) + + .p2align 4 +L(nibble_ashr_5): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xffe0, %esi + jnz L(ashr_5_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $11, REM + jbe L(ashr_5_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_5) + + .p2align 4 +L(ashr_5_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $5, %xmm0 + psrldq $5, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_6 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(10~15) n -10 5(15 +(n-10) - n) ashr_6 + */ + + .p2align 4 +L(ashr_6): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -10(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $6, FLAGS + lea 6(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_6): + add $16, %edi + jg L(nibble_ashr_6) + +L(gobble_ashr_6): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $6, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_6) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $6, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_6) + + .p2align 4 +L(nibble_ashr_6): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xffc0, %esi + jnz L(ashr_6_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $10, REM + jbe L(ashr_6_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_6) + + .p2align 4 +L(ashr_6_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $6, %xmm0 + psrldq $6, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_7 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n-9) - n) ashr_7 + */ + + .p2align 4 +L(ashr_7): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -9(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $7, FLAGS + lea 8(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_7): + add $16, %edi + jg L(nibble_ashr_7) + +L(gobble_ashr_7): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $7, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_7) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $7, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_7) + + .p2align 4 +L(nibble_ashr_7): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xff80, %esi + jnz L(ashr_7_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $9, REM + jbe L(ashr_7_exittail) +#endif + pxor %xmm0, %xmm0 + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_7) + + .p2align 4 +L(ashr_7_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $7, %xmm0 + psrldq $7, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_8 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n-8) - n) ashr_8 + */ + .p2align 4 +L(ashr_8): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -8(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $8, FLAGS + lea 8(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_8): + add $16, %edi + jg L(nibble_ashr_8) + +L(gobble_ashr_8): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $8, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_8) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $8, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_8) + + .p2align 4 +L(nibble_ashr_8): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xff00, %esi + jnz L(ashr_8_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $8, REM + jbe L(ashr_8_exittail) +#endif + pxor %xmm0, %xmm0 + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_8) + + .p2align 4 +L(ashr_8_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $8, %xmm0 + psrldq $8, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_9 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n-7) - n) ashr_9 + */ + .p2align 4 +L(ashr_9): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -7(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $9, FLAGS + lea 9(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_9): + add $16, %edi + jg L(nibble_ashr_9) + +L(gobble_ashr_9): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $9, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_9) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $9, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_9) + + .p2align 4 +L(nibble_ashr_9): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfe00, %esi + jnz L(ashr_9_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + jbe L(ashr_9_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_9) + + .p2align 4 +L(ashr_9_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $9, %xmm0 + psrldq $9, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_10 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n-6) - n) ashr_10 + */ + .p2align 4 +L(ashr_10): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -6(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $10, FLAGS + lea 10(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_10): + add $16, %edi + jg L(nibble_ashr_10) + +L(gobble_ashr_10): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $10, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_10) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $10, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_10) + + .p2align 4 +L(nibble_ashr_10): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfc00, %esi + jnz L(ashr_10_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, REM + jbe L(ashr_10_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_10) + + .p2align 4 +L(ashr_10_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $10, %xmm0 + psrldq $10, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_11 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n-5) - n) ashr_11 + */ + .p2align 4 +L(ashr_11): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -5(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $11, FLAGS + lea 11(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_11): + add $16, %edi + jg L(nibble_ashr_11) + +L(gobble_ashr_11): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $11, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_11) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $11, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_11) + + .p2align 4 +L(nibble_ashr_11): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xf800, %esi + jnz L(ashr_11_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, REM + jbe L(ashr_11_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_11) + + .p2align 4 +L(ashr_11_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $11, %xmm0 + psrldq $11, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_12 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n-4) - n) ashr_12 + */ + .p2align 4 +L(ashr_12): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -4(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $12, FLAGS + lea 12(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_12): + add $16, %edi + jg L(nibble_ashr_12) + +L(gobble_ashr_12): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $12, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_12) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $12, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_12) + + .p2align 4 +L(nibble_ashr_12): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xf000, %esi + jnz L(ashr_12_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + jbe L(ashr_12_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_12) + + .p2align 4 +L(ashr_12_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $12, %xmm0 + psrldq $12, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_13 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n-3) - n) ashr_13 + */ + .p2align 4 +L(ashr_13): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -3(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $13, FLAGS + lea 13(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_13): + add $16, %edi + jg L(nibble_ashr_13) + +L(gobble_ashr_13): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $13, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_13) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $13, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_13) + + .p2align 4 +L(nibble_ashr_13): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xe000, %esi + jnz L(ashr_13_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, REM + jbe L(ashr_13_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_13) + + .p2align 4 +L(ashr_13_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $13, %xmm0 + psrldq $13, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_14 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n-2) - n) ashr_14 + */ + .p2align 4 +L(ashr_14): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -2(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $14, FLAGS + lea 14(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_14): + add $16, %edi + jg L(nibble_ashr_14) + +L(gobble_ashr_14): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $14, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_14) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $14, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_14) + + .p2align 4 +L(nibble_ashr_14): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xc000, %esi + jnz L(ashr_14_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $2, REM + jbe L(ashr_14_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_14) + + .p2align 4 +L(ashr_14_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $14, %xmm0 + psrldq $14, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_14 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n-1) - n) ashr_15 + */ + + .p2align 4 +L(ashr_15): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $1, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -1(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $15, FLAGS + lea 15(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_15): + add $16, %edi + jg L(nibble_ashr_15) + +L(gobble_ashr_15): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $15, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_15) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $15, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_15) + + .p2align 4 +L(nibble_ashr_15): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0x8000, %esi + jnz L(ashr_15_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $1, REM + jbe L(ashr_15_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_15) + + .p2align 4 +L(ashr_15_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $15, %xmm0 + psrldq $15, %xmm3 + jmp L(aftertail) + + .p2align 4 +L(aftertail): + TOLOWER (%xmm1, %xmm3) + pcmpeqb %xmm3, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + not %esi +L(exit): + mov FLAGS, %edi + and $0x1f, %edi + lea -16(%edi, %ecx), %edi +L(less32bytes): + add %edi, %edx + add %ecx, %eax + testl $0x20, FLAGS + jz L(ret2) + xchg %eax, %edx + + .p2align 4 +L(ret2): + mov %esi, %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + addl $4, %esp + cfi_adjust_cfa_offset (-4) +#endif + POP (%esi) + POP (%edi) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + POP (FLAGS) +#endif +L(less16bytes): + test %cl, %cl + jz L(2next_8_bytes) + + test $0x01, %cl + jnz L(Byte0) + + test $0x02, %cl + jnz L(Byte1) + + test $0x04, %cl + jnz L(Byte2) + + test $0x08, %cl + jnz L(Byte3) + + test $0x10, %cl + jnz L(Byte4) + + test $0x20, %cl + jnz L(Byte5) + + test $0x40, %cl + jnz L(Byte6) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + jbe L(eq) +#endif + + movzx 7(%eax), %ecx + movzx 7(%edx), %eax +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte0): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $0, REM + jbe L(eq) +#endif + movzx (%eax), %ecx + movzx (%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte1): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $1, REM + jbe L(eq) +#endif + movzx 1(%eax), %ecx + movzx 1(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte2): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $2, REM + jbe L(eq) +#endif + movzx 2(%eax), %ecx + movzx 2(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte3): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, REM + jbe L(eq) +#endif + movzx 3(%eax), %ecx + movzx 3(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte4): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + jbe L(eq) +#endif + movzx 4(%eax), %ecx + movzx 4(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte5): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, REM + jbe L(eq) +#endif + movzx 5(%eax), %ecx + movzx 5(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte6): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, REM + jbe L(eq) +#endif + movzx 6(%eax), %ecx + movzx 6(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(2next_8_bytes): + add $8, %eax + add $8, %edx +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $8, REM + lea -8(REM), REM + jbe L(eq) +#endif + + test $0x01, %ch + jnz L(Byte0) + + test $0x02, %ch + jnz L(Byte1) + + test $0x04, %ch + jnz L(Byte2) + + test $0x08, %ch + jnz L(Byte3) + + test $0x10, %ch + jnz L(Byte4) + + test $0x20, %ch + jnz L(Byte5) + + test $0x40, %ch + jnz L(Byte6) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + jbe L(eq) +#endif + movzx 7(%eax), %ecx + movzx 7(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +#ifdef USE_AS_STRNCMP +L(neq_sncmp): +#endif +L(neq): + mov $1, %eax + ja L(neq_bigger) + neg %eax +L(neq_bigger): +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + addl $4, %esp + cfi_adjust_cfa_offset (-4) +#endif +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + POP (REM) +#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + POP (%ebx) +# endif +#endif + ret + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + .p2align 4 + cfi_restore_state +L(more8byteseq): + +# ifdef USE_AS_STRNCASECMP_L + addl $4, %esp + cfi_adjust_cfa_offset (-4) +# endif + POP (%esi) + POP (%edi) +# ifdef USE_AS_STRNCMP + POP (FLAGS) +# endif +#endif + +#ifdef USE_AS_STRNCMP +L(eq_sncmp): +#endif +L(eq): + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + POP (REM) +#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + POP (%ebx) +# endif +#endif + xorl %eax, %eax + ret + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + .p2align 4 +# if defined USE_AS_STRNCASECMP_L && defined PIC + CFI_PUSH (%ebx) +# endif + CFI_PUSH (REM) +L(less16bytes_sncmp): +# ifdef USE_AS_STRNCASECMP_L + PUSH (%esi) +# endif + test REM, REM + jz L(eq_sncmp) + + movzbl (%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl (%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, (%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $1, REM + je L(eq_sncmp) + + movzbl 1(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 1(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 1(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $2, REM + je L(eq_sncmp) + + movzbl 2(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 2(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 2(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $3, REM + je L(eq_sncmp) + + movzbl 3(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 3(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 3(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $4, REM + je L(eq_sncmp) + + movzbl 4(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 4(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 4(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $5, REM + je L(eq_sncmp) + + movzbl 5(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 5(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 5(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $6, REM + je L(eq_sncmp) + + movzbl 6(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 6(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 6(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $7, REM + je L(eq_sncmp) + + movzbl 7(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 7(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 7(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + + cmp $8, REM + je L(eq_sncmp) + + movzbl 8(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 8(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 8(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $9, REM + je L(eq_sncmp) + + movzbl 9(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 9(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 9(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $10, REM + je L(eq_sncmp) + + movzbl 10(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 10(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 10(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $11, REM + je L(eq_sncmp) + + movzbl 11(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 11(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 11(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + + cmp $12, REM + je L(eq_sncmp) + + movzbl 12(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 12(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 12(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $13, REM + je L(eq_sncmp) + + movzbl 13(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 13(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 13(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $14, REM + je L(eq_sncmp) + + movzbl 14(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 14(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 14(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $15, REM + je L(eq_sncmp) + + movzbl 15(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 15(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 15(%edx) +# endif + jne L(neq_sncmp) + +# ifdef USE_AS_STRNCASECMP_L +L(eq_sncmp): + POP (%esi) +# endif + POP (REM) +# if defined USE_AS_STRNCASECMP_L && defined PIC + POP (%ebx) +# endif + xor %eax, %eax + ret + +# ifdef USE_AS_STRNCASECMP_L + .p2align 4 +# ifdef PIC + CFI_PUSH (%ebx) +# endif + CFI_PUSH (REM) + CFI_PUSH (%esi) +L(neq_sncmp): + mov $1, %eax + mov $-1, %edx + cmovna %edx, %eax + POP (%esi) + POP (REM) +# ifdef PIC + POP (%ebx) +# endif + ret +# endif +#endif + +END (STRCMP) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S new file mode 100644 index 0000000000..56de25a4b7 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S @@ -0,0 +1,95 @@ +/* Multiple versions of strcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifdef USE_AS_STRNCMP +# define STRCMP strncmp +# define __GI_STRCMP __GI_strncmp +# define __STRCMP_IA32 __strncmp_ia32 +# define __STRCMP_SSSE3 __strncmp_ssse3 +# define __STRCMP_SSE4_2 __strncmp_sse4_2 +#elif defined USE_AS_STRCASECMP_L +# define STRCMP __strcasecmp_l +# define __GI_STRCMP __GI_strcasecmp_l +# define __STRCMP_IA32 __strcasecmp_l_ia32 +# define __STRCMP_SSSE3 __strcasecmp_l_ssse3 +# define __STRCMP_SSE4_2 __strcasecmp_l_sse4_2 +#elif defined USE_AS_STRNCASECMP_L +# define STRCMP __strncasecmp_l +# define __GI_STRCMP __GI_strncasecmp_l +# define __STRCMP_IA32 __strncasecmp_l_ia32 +# define __STRCMP_SSSE3 __strncasecmp_l_ssse3 +# define __STRCMP_SSE4_2 __strncasecmp_l_sse4_2 +#else +# define STRCMP strcmp +# define __GI_STRCMP __GI_strcmp +# define __STRCMP_IA32 __strcmp_ia32 +# define __STRCMP_SSSE3 __strcmp_ssse3 +# define __STRCMP_SSE4_2 __strcmp_sse4_2 +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncmp in static library since we + need strncmp before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc) + .text +ENTRY(STRCMP) + .type STRCMP, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__STRCMP_IA32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__STRCMP_SSSE3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + LOAD_FUNC_GOT_EAX (__STRCMP_SSE4_2) +2: ret +END(STRCMP) + +# undef ENTRY +# define ENTRY(name) \ + .type __STRCMP_IA32, @function; \ + .p2align 4; \ + .globl __STRCMP_IA32; \ + .hidden __STRCMP_IA32; \ + __STRCMP_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __STRCMP_IA32, .-__STRCMP_IA32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCMP; __GI_STRCMP = __STRCMP_IA32 +# endif +#endif + +#if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L \ + && !defined USE_AS_STRNCASECMP_L +# include "../strcmp.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S new file mode 100644 index 0000000000..ed627a5f62 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S @@ -0,0 +1,2250 @@ +/* strcpy with SSE2 and unaligned load + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#if IS_IN (libc) + +# include <sysdep.h> + + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef STRCPY +# define STRCPY __strcpy_sse2 +# endif + +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +# ifdef USE_AS_STRNCPY +# define PARMS 16 +# define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi) +# define RETURN POP(%edi); POP(%esi); POP(%ebx); ret; \ + CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi); + +# ifdef SHARED +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into ECX and branch to it. TABLE is a + jump table with relative offsets. + INDEX is a register contains the index into the jump table. + SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into ECX. */ \ + SETUP_PIC_REG(cx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ecx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ecx,INDEX,SCALE), %ecx; \ + /* We loaded the jump table and adjusted ECX. Go. */ \ + jmp *%ecx +# else +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + +.text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edi + mov STR2(%esp), %esi + movl LEN(%esp), %ebx + test %ebx, %ebx + jz L(ExitZero) + + mov %esi, %ecx +# ifndef USE_AS_STPCPY + mov %edi, %eax /* save result */ +# endif + and $15, %ecx + jz L(SourceStringAlignmentZero) + + and $-16, %esi + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%esi), %xmm1 + add %ecx, %ebx + pmovmskb %xmm1, %edx + shr %cl, %edx +# ifdef USE_AS_STPCPY + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# else + cmp $17, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $32, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# else + cmp $33, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%edi) + + sub %ecx, %edi + +/* If source address alignment != destination address alignment */ + .p2align 4 +L(Unalign16Both): + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $48, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm2) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm3) + + movaps 16(%esi, %ecx), %xmm4 + movdqu %xmm3, (%edi, %ecx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm4) + + movaps 16(%esi, %ecx), %xmm1 + movdqu %xmm4, (%edi, %ecx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm1) + + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm2) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm3) + + movdqu %xmm3, (%edi, %ecx) + mov %esi, %edx + lea 16(%esi, %ecx), %esi + and $-0x40, %esi + sub %esi, %edx + sub %edx, %edi + lea 128(%ebx, %edx), %ebx + +L(Unaligned64Loop): + movaps (%esi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%esi), %xmm5 + movaps 32(%esi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%esi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) + test %edx, %edx + jnz L(Unaligned64Leave) +L(Unaligned64Loop_start): + add $64, %edi + add $64, %esi + movdqu %xmm4, -64(%edi) + movaps (%esi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%edi) + movaps 16(%esi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%esi), %xmm3 + movdqu %xmm6, -32(%edi) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%edi) + movaps 48(%esi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) + test %edx, %edx + jz L(Unaligned64Loop_start) +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %ecx, %ecx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %ecx, %edx + movdqu %xmm4, (%edi) + movdqu %xmm5, 16(%edi) + movdqu %xmm6, 32(%edi) +# ifdef USE_AS_STPCPY + lea 48(%edi, %edx), %eax +# endif + movdqu %xmm7, 48(%edi) + add $15, %ebx + sub %edx, %ebx + lea 49(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + +/* If source address alignment == destination address alignment */ + +L(SourceStringAlignmentZero): + pxor %xmm0, %xmm0 + movdqa (%esi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# else + cmp $17, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1) + + pcmpeqb 16(%esi), %xmm0 + movdqu %xmm1, (%edi) + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $32, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# else + cmp $33, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes1) + + jmp L(Unalign16Both) + +/*-----------------End of main part---------------------------*/ + +/* Case1 */ + .p2align 4 +L(CopyFrom1To16BytesTail): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %esi + add $16, %edi + sub $16, %ebx +L(CopyFrom1To16BytesTail1): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + sub %ecx, %ebx + bsf %edx, %edx + add %ecx, %esi + add $16, %edx + sub %ecx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %edx, %edx +# ifdef USE_AS_STPCPY + lea (%edi, %edx), %eax +# endif + movdqu %xmm4, (%edi) + add $63, %ebx + sub %edx, %ebx + lea 1(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %ecx, %edx + movdqu %xmm4, (%edi) +# ifdef USE_AS_STPCPY + lea 16(%edi, %edx), %eax +# endif + movdqu %xmm5, 16(%edi) + add $47, %ebx + sub %edx, %ebx + lea 17(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %edx, %edx + movdqu %xmm4, (%edi) + movdqu %xmm5, 16(%edi) +# ifdef USE_AS_STPCPY + lea 32(%edi, %edx), %eax +# endif + movdqu %xmm6, 32(%edi) + add $31, %ebx + sub %edx, %ebx + lea 33(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm6): + movdqu %xmm6, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm5): + movdqu %xmm5, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm4): + movdqu %xmm4, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm3): + movdqu %xmm3, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm1): + movdqu %xmm1, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %ecx, %edi + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + add $16, %edx + sub %ecx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +L(CopyFrom1To16BytesTailCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %ecx, %edi + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To32BytesCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTailCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %edi + add $16, %esi + sub $16, %ebx +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(Exit0): +# ifdef USE_AS_STPCPY + mov %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit1): + movb %dh, (%edi) +# ifdef USE_AS_STPCPY + lea (%edi), %eax +# endif + sub $1, %ebx + lea 1(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit2): + movw (%esi), %dx + movw %dx, (%edi) +# ifdef USE_AS_STPCPY + lea 1(%edi), %eax +# endif + sub $2, %ebx + lea 2(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit3): + movw (%esi), %cx + movw %cx, (%edi) + movb %dh, 2(%edi) +# ifdef USE_AS_STPCPY + lea 2(%edi), %eax +# endif + sub $3, %ebx + lea 3(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit4): + movl (%esi), %edx + movl %edx, (%edi) +# ifdef USE_AS_STPCPY + lea 3(%edi), %eax +# endif + sub $4, %ebx + lea 4(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit5): + movl (%esi), %ecx + movb %dh, 4(%edi) + movl %ecx, (%edi) +# ifdef USE_AS_STPCPY + lea 4(%edi), %eax +# endif + sub $5, %ebx + lea 5(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%edi) + movw %dx, 4(%edi) +# ifdef USE_AS_STPCPY + lea 5(%edi), %eax +# endif + sub $6, %ebx + lea 6(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%edi) + movl %edx, 3(%edi) +# ifdef USE_AS_STPCPY + lea 6(%edi), %eax +# endif + sub $7, %ebx + lea 7(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 7(%edi), %eax +# endif + sub $8, %ebx + lea 8(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit9): + movlpd (%esi), %xmm0 + movb %dh, 8(%edi) + movlpd %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 8(%edi), %eax +# endif + sub $9, %ebx + lea 9(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 9(%edi), %eax +# endif + sub $10, %ebx + lea 10(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) +# ifdef USE_AS_STPCPY + lea 10(%edi), %eax +# endif + sub $11, %ebx + lea 11(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 11(%edi), %eax +# endif + sub $12, %ebx + lea 12(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 5(%edi) +# ifdef USE_AS_STPCPY + lea 12(%edi), %eax +# endif + sub $13, %ebx + lea 13(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 6(%edi) +# ifdef USE_AS_STPCPY + lea 13(%edi), %eax +# endif + sub $14, %ebx + lea 14(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 7(%edi) +# ifdef USE_AS_STPCPY + lea 14(%edi), %eax +# endif + sub $15, %ebx + lea 15(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 15(%edi), %eax +# endif + sub $16, %ebx + lea 16(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit17): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) + movb %dh, 16(%edi) +# ifdef USE_AS_STPCPY + lea 16(%edi), %eax +# endif + sub $17, %ebx + lea 17(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%edi) + movw %cx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 17(%edi), %eax +# endif + sub $18, %ebx + lea 18(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 15(%edi) +# ifdef USE_AS_STPCPY + lea 18(%edi), %eax +# endif + sub $19, %ebx + lea 19(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 19(%edi), %eax +# endif + sub $20, %ebx + lea 20(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) + movb %dh, 20(%edi) +# ifdef USE_AS_STPCPY + lea 20(%edi), %eax +# endif + sub $21, %ebx + lea 21(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 14(%edi) +# ifdef USE_AS_STPCPY + lea 21(%edi), %eax +# endif + sub $22, %ebx + lea 22(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 15(%edi) +# ifdef USE_AS_STPCPY + lea 22(%edi), %eax +# endif + sub $23, %ebx + lea 23(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 23(%edi), %eax +# endif + sub $24, %ebx + lea 24(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit25): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movb %dh, 24(%edi) +# ifdef USE_AS_STPCPY + lea 24(%edi), %eax +# endif + sub $25, %ebx + lea 25(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movw %cx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 25(%edi), %eax +# endif + sub $26, %ebx + lea 26(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 23(%edi) +# ifdef USE_AS_STPCPY + lea 26(%edi), %eax +# endif + sub $27, %ebx + lea 27(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 27(%edi), %eax +# endif + sub $28, %ebx + lea 28(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 13(%edi) +# ifdef USE_AS_STPCPY + lea 28(%edi), %eax +# endif + sub $29, %ebx + lea 29(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 14(%edi) +# ifdef USE_AS_STPCPY + lea 29(%edi), %eax +# endif + sub $30, %ebx + lea 30(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + + .p2align 4 +L(Exit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 15(%edi) +# ifdef USE_AS_STPCPY + lea 30(%edi), %eax +# endif + sub $31, %ebx + lea 31(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 31(%edi), %eax +# endif + sub $32, %ebx + lea 32(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(StrncpyExit1): + movb (%esi), %dl + movb %dl, (%edi) +# ifdef USE_AS_STPCPY + lea 1(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit2): + movw (%esi), %dx + movw %dx, (%edi) +# ifdef USE_AS_STPCPY + lea 2(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit3): + movw (%esi), %cx + movb 2(%esi), %dl + movw %cx, (%edi) + movb %dl, 2(%edi) +# ifdef USE_AS_STPCPY + lea 3(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit4): + movl (%esi), %edx + movl %edx, (%edi) +# ifdef USE_AS_STPCPY + lea 4(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit5): + movl (%esi), %ecx + movb 4(%esi), %dl + movl %ecx, (%edi) + movb %dl, 4(%edi) +# ifdef USE_AS_STPCPY + lea 5(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%edi) + movw %dx, 4(%edi) +# ifdef USE_AS_STPCPY + lea 6(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%edi) + movl %edx, 3(%edi) +# ifdef USE_AS_STPCPY + lea 7(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 8(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit9): + movlpd (%esi), %xmm0 + movb 8(%esi), %dl + movlpd %xmm0, (%edi) + movb %dl, 8(%edi) +# ifdef USE_AS_STPCPY + lea 9(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 10(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) +# ifdef USE_AS_STPCPY + lea 11(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 12(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 5(%edi) +# ifdef USE_AS_STPCPY + lea 13(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 6(%edi) +# ifdef USE_AS_STPCPY + lea 14(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 7(%edi) +# ifdef USE_AS_STPCPY + lea 15(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 16(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit17): + movdqu (%esi), %xmm0 + movb 16(%esi), %cl + movdqu %xmm0, (%edi) + movb %cl, 16(%edi) +# ifdef USE_AS_STPCPY + lea 17(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%edi) + movw %cx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 18(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 15(%edi) +# ifdef USE_AS_STPCPY + lea 19(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 20(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movb 20(%esi), %dl + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) + movb %dl, 20(%edi) +# ifdef USE_AS_STPCPY + lea 21(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 14(%edi) +# ifdef USE_AS_STPCPY + lea 22(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 15(%edi) +# ifdef USE_AS_STPCPY + lea 23(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 24(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit25): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movb 24(%esi), %cl + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movb %cl, 24(%edi) +# ifdef USE_AS_STPCPY + lea 25(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movw %cx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 26(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 23(%edi) +# ifdef USE_AS_STPCPY + lea 27(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 28(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 13(%edi) +# ifdef USE_AS_STPCPY + lea 29(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 14(%edi) +# ifdef USE_AS_STPCPY + lea 30(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 15(%edi) +# ifdef USE_AS_STPCPY + lea 31(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 32(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit33): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movb 32(%esi), %cl + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) + movb %cl, 32(%edi) + RETURN + + .p2align 4 +L(Fill0): + RETURN + + .p2align 4 +L(Fill1): + movb %dl, (%edi) + RETURN + + .p2align 4 +L(Fill2): + movw %dx, (%edi) + RETURN + + .p2align 4 +L(Fill3): + movl %edx, -1(%edi) + RETURN + + .p2align 4 +L(Fill4): + movl %edx, (%edi) + RETURN + + .p2align 4 +L(Fill5): + movl %edx, (%edi) + movb %dl, 4(%edi) + RETURN + + .p2align 4 +L(Fill6): + movl %edx, (%edi) + movw %dx, 4(%edi) + RETURN + + .p2align 4 +L(Fill7): + movlpd %xmm0, -1(%edi) + RETURN + + .p2align 4 +L(Fill8): + movlpd %xmm0, (%edi) + RETURN + + .p2align 4 +L(Fill9): + movlpd %xmm0, (%edi) + movb %dl, 8(%edi) + RETURN + + .p2align 4 +L(Fill10): + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) + RETURN + + .p2align 4 +L(Fill11): + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) + RETURN + + .p2align 4 +L(Fill12): + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) + RETURN + + .p2align 4 +L(Fill13): + movlpd %xmm0, (%edi) + movlpd %xmm0, 5(%edi) + RETURN + + .p2align 4 +L(Fill14): + movlpd %xmm0, (%edi) + movlpd %xmm0, 6(%edi) + RETURN + + .p2align 4 +L(Fill15): + movdqu %xmm0, -1(%edi) + RETURN + + .p2align 4 +L(Fill16): + movdqu %xmm0, (%edi) + RETURN + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm2): + movdqu %xmm2, (%edi, %ecx) + + .p2align 4 +L(CopyFrom1To16BytesXmmExit): + bsf %edx, %edx + add $15, %ebx + add %ecx, %edi +# ifdef USE_AS_STPCPY + lea (%edi, %edx), %eax +# endif + sub %edx, %ebx + lea 1(%edi, %edx), %edi + + .p2align 4 +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %edx, %edx + sub $16, %ebx + jbe L(StrncpyFillExit) + + movdqu %xmm0, (%edi) + add $16, %edi + + mov %edi, %esi + and $0xf, %esi + sub %esi, %edi + add %esi, %ebx + sub $64, %ebx + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%edi) + movdqa %xmm0, 16(%edi) + movdqa %xmm0, 32(%edi) + movdqa %xmm0, 48(%edi) + add $64, %edi + sub $64, %ebx + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %ebx + jl L(StrncpyFillLess32) + movdqa %xmm0, (%edi) + movdqa %xmm0, 16(%edi) + add $32, %edi + sub $16, %ebx + jl L(StrncpyFillExit) + movdqa %xmm0, (%edi) + add $16, %edi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + +L(StrncpyFillLess32): + add $16, %ebx + jl L(StrncpyFillExit) + movdqa %xmm0, (%edi) + add $16, %edi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + +L(StrncpyFillExit): + add $16, %ebx + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %edx, %edx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%ebx), %ecx + and $-16, %ecx + add $48, %ebx + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%edi) +# ifdef USE_AS_STPCPY + lea 64(%edi), %eax +# endif + RETURN + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %ecx, %ecx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $48, %ebx + jle L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm4) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm4, (%edi) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm5) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm5, 16(%edi) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm6) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm6, 32(%edi) + lea 16(%edi, %ecx), %edi + lea 16(%esi, %ecx), %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(ExitZero): + movl %edi, %eax + RETURN + +END (STRCPY) + + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) + +L(ExitStrncpyTable): + .int JMPTBL(L(Exit0), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) + + .p2align 4 +L(FillTable): + .int JMPTBL(L(Fill0), L(FillTable)) + .int JMPTBL(L(Fill1), L(FillTable)) + .int JMPTBL(L(Fill2), L(FillTable)) + .int JMPTBL(L(Fill3), L(FillTable)) + .int JMPTBL(L(Fill4), L(FillTable)) + .int JMPTBL(L(Fill5), L(FillTable)) + .int JMPTBL(L(Fill6), L(FillTable)) + .int JMPTBL(L(Fill7), L(FillTable)) + .int JMPTBL(L(Fill8), L(FillTable)) + .int JMPTBL(L(Fill9), L(FillTable)) + .int JMPTBL(L(Fill10), L(FillTable)) + .int JMPTBL(L(Fill11), L(FillTable)) + .int JMPTBL(L(Fill12), L(FillTable)) + .int JMPTBL(L(Fill13), L(FillTable)) + .int JMPTBL(L(Fill14), L(FillTable)) + .int JMPTBL(L(Fill15), L(FillTable)) + .int JMPTBL(L(Fill16), L(FillTable)) +# else +# define PARMS 4 +# define ENTRANCE +# define RETURN POP (%edi); ret; CFI_PUSH (%edi) +# define RETURN1 ret + + .text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edx + mov STR2(%esp), %ecx + + cmpb $0, (%ecx) + jz L(ExitTail1) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + cmpb $0, 3(%ecx) + jz L(ExitTail4) + cmpb $0, 4(%ecx) + jz L(ExitTail5) + cmpb $0, 5(%ecx) + jz L(ExitTail6) + cmpb $0, 6(%ecx) + jz L(ExitTail7) + cmpb $0, 7(%ecx) + jz L(ExitTail8) + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + cmpb $0, 11(%ecx) + jz L(ExitTail12) + cmpb $0, 12(%ecx) + jz L(ExitTail13) + cmpb $0, 13(%ecx) + jz L(ExitTail14) + cmpb $0, 14(%ecx) + jz L(ExitTail15) + cmpb $0, 15(%ecx) + jz L(ExitTail16) + + PUSH (%edi) + PUSH (%ebx) + + mov %edx, %edi + lea 16(%ecx), %ebx + and $-16, %ebx + pxor %xmm0, %xmm0 + movdqu (%ecx), %xmm1 + movdqu %xmm1, (%edx) + pcmpeqb (%ebx), %xmm0 + pmovmskb %xmm0, %eax + sub %ecx, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + mov %ecx, %eax + lea 16(%ecx), %ecx + and $-16, %ecx + sub %ecx, %eax + sub %eax, %edx + xor %ebx, %ebx + + .p2align 4 + movdqa (%ecx), %xmm1 + movaps 16(%ecx), %xmm2 + movdqu %xmm1, (%edx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm3 + movdqu %xmm2, (%edx, %ebx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm4 + movdqu %xmm3, (%edx, %ebx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm1 + movdqu %xmm4, (%edx, %ebx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm2 + movdqu %xmm1, (%edx, %ebx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm3 + movdqu %xmm2, (%edx, %ebx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movdqu %xmm3, (%edx, %ebx) + mov %ecx, %eax + lea 16(%ecx, %ebx), %ecx + and $-0x40, %ecx + sub %ecx, %eax + sub %eax, %edx + +L(Aligned64Loop): + movaps (%ecx), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%ecx), %xmm5 + movaps 32(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%ecx), %xmm7 + pminub %xmm5, %xmm2 + add $64, %ecx + pminub %xmm7, %xmm3 + add $64, %edx + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(Aligned64Leave) +L(Aligned64Loop_start): + movdqu %xmm4, -64(%edx) + movaps (%ecx), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%edx) + movaps 16(%ecx), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%ecx), %xmm3 + movdqu %xmm6, -32(%edx) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%edx) + movaps 48(%ecx), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + add $64, %edx + add $64, %ecx + test %eax, %eax + jz L(Aligned64Loop_start) +L(Aligned64Leave): + sub $0xa0, %ebx + pxor %xmm0, %xmm0 + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %eax + movdqu %xmm4, -64(%edx) + test %eax, %eax + lea 16(%ebx), %ebx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %eax + movdqu %xmm5, -48(%edx) + test %eax, %eax + lea 16(%ebx), %ebx + jnz L(CopyFrom1To16Bytes) + + movdqu %xmm6, -32(%edx) + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%ebx), %ebx + +/*-----------------End of main part---------------------------*/ + + .p2align 4 +L(CopyFrom1To16Bytes): + add %ebx, %edx + add %ebx, %ecx + + POP (%ebx) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + /* Exit 8 */ + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 7(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + /* Exit 16 */ + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm0 + movlpd %xmm0, 8(%edx) +# ifdef USE_AS_STPCPY + lea 15(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit1): + movb (%ecx), %al + movb %al, (%edx) +# ifdef USE_AS_STPCPY + lea (%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit2): + movw (%ecx), %ax + movw %ax, (%edx) +# ifdef USE_AS_STPCPY + lea 1(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) +# ifdef USE_AS_STPCPY + lea 2(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) +# ifdef USE_AS_STPCPY + lea 3(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) +# ifdef USE_AS_STPCPY + lea 4(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 5(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) +# ifdef USE_AS_STPCPY + lea 6(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit9): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) +# ifdef USE_AS_STPCPY + lea 8(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit10): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 9(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit11): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) +# ifdef USE_AS_STPCPY + lea 10(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit12): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 11(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) +# ifdef USE_AS_STPCPY + lea 12(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) +# ifdef USE_AS_STPCPY + lea 13(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) +# ifdef USE_AS_STPCPY + lea 14(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + +CFI_POP (%edi) + + .p2align 4 +L(ExitTail1): + movb (%ecx), %al + movb %al, (%edx) + movl %edx, %eax + RETURN1 + + .p2align 4 +L(ExitTail2): + movw (%ecx), %ax + movw %ax, (%edx) +# ifdef USE_AS_STPCPY + lea 1(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) +# ifdef USE_AS_STPCPY + lea 2(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail4): + movl (%ecx), %eax + movl %eax, (%edx) +# ifdef USE_AS_STPCPY + lea 3(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) +# ifdef USE_AS_STPCPY + lea 4(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 5(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) +# ifdef USE_AS_STPCPY + lea 6(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail8): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 7(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail9): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) +# ifdef USE_AS_STPCPY + lea 8(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail10): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 9(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail11): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) +# ifdef USE_AS_STPCPY + lea 10(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail12): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 11(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) +# ifdef USE_AS_STPCPY + lea 12(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) +# ifdef USE_AS_STPCPY + lea 13(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) +# ifdef USE_AS_STPCPY + lea 14(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail16): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm0 + movlpd %xmm0, 8(%edx) +# ifdef USE_AS_STPCPY + lea 15(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + +END (STRCPY) +# endif + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S new file mode 100644 index 0000000000..effd85da94 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S @@ -0,0 +1,3901 @@ +/* strcpy with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef STRCPY +# define STRCPY __strcpy_ssse3 +# endif + +# ifdef USE_AS_STRNCPY +# define PARMS 8 +# define ENTRANCE PUSH (%ebx) +# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx); +# define RETURN1 POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi) +# else +# define PARMS 4 +# define ENTRANCE +# define RETURN ret +# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi) +# endif + +# ifdef USE_AS_STPCPY +# define SAVE_RESULT(n) lea n(%edx), %eax +# define SAVE_RESULT_TAIL(n) lea n(%edx), %eax +# else +# define SAVE_RESULT(n) movl %edi, %eax +# define SAVE_RESULT_TAIL(n) movl %edx, %eax +# endif + +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +/* In this code following instructions are used for copying: + movb - 1 byte + movw - 2 byte + movl - 4 byte + movlpd - 8 byte + movaps - 16 byte - requires 16 byte alignment + of sourse and destination adresses. +*/ + +.text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edx + mov STR2(%esp), %ecx +# ifdef USE_AS_STRNCPY + movl LEN(%esp), %ebx + cmp $8, %ebx + jbe L(StrncpyExit8Bytes) +# endif + cmpb $0, (%ecx) + jz L(ExitTail1) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + cmpb $0, 3(%ecx) + jz L(ExitTail4) + cmpb $0, 4(%ecx) + jz L(ExitTail5) + cmpb $0, 5(%ecx) + jz L(ExitTail6) + cmpb $0, 6(%ecx) + jz L(ExitTail7) + cmpb $0, 7(%ecx) + jz L(ExitTail8) +# ifdef USE_AS_STRNCPY + cmp $16, %ebx + jb L(StrncpyExit15Bytes) +# endif + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + cmpb $0, 11(%ecx) + jz L(ExitTail12) + cmpb $0, 12(%ecx) + jz L(ExitTail13) + cmpb $0, 13(%ecx) + jz L(ExitTail14) + cmpb $0, 14(%ecx) + jz L(ExitTail15) +# ifdef USE_AS_STRNCPY + cmp $16, %ebx + je L(ExitTail16) +# endif + cmpb $0, 15(%ecx) + jz L(ExitTail16) + + PUSH (%edi) + mov %edx, %edi +# endif + PUSH (%esi) +# ifdef USE_AS_STRNCPY + mov %ecx, %esi + sub $16, %ebx + and $0xf, %esi + +/* add 16 bytes ecx_offset to ebx */ + + add %esi, %ebx +# endif + lea 16(%ecx), %esi + and $-16, %esi + pxor %xmm0, %xmm0 + movlpd (%ecx), %xmm1 + movlpd %xmm1, (%edx) + + pcmpeqb (%esi), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm1, 8(%edx) + + pmovmskb %xmm0, %eax + sub %ecx, %esi + +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + mov %edx, %eax + lea 16(%edx), %edx + and $-16, %edx + sub %edx, %eax + +# ifdef USE_AS_STRNCPY + add %eax, %esi + lea -1(%esi), %esi + and $1<<31, %esi + test %esi, %esi + jnz L(ContinueCopy) + lea 16(%ebx), %ebx + +L(ContinueCopy): +# endif + sub %eax, %ecx + mov %ecx, %eax + and $0xf, %eax + mov $0, %esi + +/* case: ecx_offset == edx_offset */ + + jz L(Align16Both) + + cmp $8, %eax + jae L(ShlHigh8) + cmp $1, %eax + je L(Shl1) + cmp $2, %eax + je L(Shl2) + cmp $3, %eax + je L(Shl3) + cmp $4, %eax + je L(Shl4) + cmp $5, %eax + je L(Shl5) + cmp $6, %eax + je L(Shl6) + jmp L(Shl7) + +L(ShlHigh8): + je L(Shl8) + cmp $9, %eax + je L(Shl9) + cmp $10, %eax + je L(Shl10) + cmp $11, %eax + je L(Shl11) + cmp $12, %eax + je L(Shl12) + cmp $13, %eax + je L(Shl13) + cmp $14, %eax + je L(Shl14) + jmp L(Shl15) + +L(Align16Both): + movaps (%ecx), %xmm1 + movaps 16(%ecx), %xmm2 + movaps %xmm1, (%edx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm4 + movaps %xmm3, (%edx, %esi) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm1 + movaps %xmm4, (%edx, %esi) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm2 + movaps %xmm1, (%edx, %esi) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%edx, %esi) + mov %ecx, %eax + lea 16(%ecx, %esi), %ecx + and $-0x40, %ecx + sub %ecx, %eax + sub %eax, %edx +# ifdef USE_AS_STRNCPY + lea 112(%ebx, %eax), %ebx +# endif + mov $-0x40, %esi + +L(Aligned64Loop): + movaps (%ecx), %xmm2 + movaps 32(%ecx), %xmm3 + movaps %xmm2, %xmm4 + movaps 16(%ecx), %xmm5 + movaps %xmm3, %xmm6 + movaps 48(%ecx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + lea 64(%edx), %edx + pcmpeqb %xmm0, %xmm3 + lea 64(%ecx), %ecx + pmovmskb %xmm3, %eax +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeaveCase2OrCase3) +# endif + test %eax, %eax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%edx) + movaps %xmm5, -48(%edx) + movaps %xmm6, -32(%edx) + movaps %xmm7, -16(%edx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): +# ifdef USE_AS_STRNCPY + lea 48(%ebx), %ebx +# endif + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%ebx), %ebx +# endif + pmovmskb %xmm0, %eax + movaps %xmm4, -64(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%ebx), %ebx +# endif + pmovmskb %xmm0, %eax + movaps %xmm5, -48(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%edx) + pcmpeqb %xmm7, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%ebx), %ebx +# endif + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl1): + movaps -1(%ecx), %xmm1 + movaps 15(%ecx), %xmm2 +L(Shl1Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 31(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -15(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -1(%ecx), %xmm1 + +L(Shl1LoopStart): + movaps 15(%ecx), %xmm2 + movaps 31(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 47(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 63(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + test %eax, %eax + palignr $1, %xmm3, %xmm4 + jnz L(Shl1Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave1) +# endif + palignr $1, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $1, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl1LoopStart) + +L(Shl1LoopExit): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) + mov $15, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl2): + movaps -2(%ecx), %xmm1 + movaps 14(%ecx), %xmm2 +L(Shl2Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 30(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -14(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -2(%ecx), %xmm1 + +L(Shl2LoopStart): + movaps 14(%ecx), %xmm2 + movaps 30(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 46(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 62(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + test %eax, %eax + palignr $2, %xmm3, %xmm4 + jnz L(Shl2Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave2) +# endif + palignr $2, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $2, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl2LoopStart) + +L(Shl2LoopExit): + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 6(%edx) + mov $14, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl3): + movaps -3(%ecx), %xmm1 + movaps 13(%ecx), %xmm2 +L(Shl3Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 29(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -13(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -3(%ecx), %xmm1 + +L(Shl3LoopStart): + movaps 13(%ecx), %xmm2 + movaps 29(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 45(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 61(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + test %eax, %eax + palignr $3, %xmm3, %xmm4 + jnz L(Shl3Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave3) +# endif + palignr $3, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $3, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl3LoopStart) + +L(Shl3LoopExit): + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) + mov $13, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl4): + movaps -4(%ecx), %xmm1 + movaps 12(%ecx), %xmm2 +L(Shl4Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 28(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -12(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -4(%ecx), %xmm1 + +L(Shl4LoopStart): + movaps 12(%ecx), %xmm2 + movaps 28(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %eax, %eax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave4) +# endif + palignr $4, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) + mov $12, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl5): + movaps -5(%ecx), %xmm1 + movaps 11(%ecx), %xmm2 +L(Shl5Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 27(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -11(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -5(%ecx), %xmm1 + +L(Shl5LoopStart): + movaps 11(%ecx), %xmm2 + movaps 27(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 43(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 59(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + test %eax, %eax + palignr $5, %xmm3, %xmm4 + jnz L(Shl5Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave5) +# endif + palignr $5, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $5, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl5LoopStart) + +L(Shl5LoopExit): + movlpd (%ecx), %xmm0 + movl 7(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 7(%edx) + mov $11, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl6): + movaps -6(%ecx), %xmm1 + movaps 10(%ecx), %xmm2 +L(Shl6Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 26(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -10(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -6(%ecx), %xmm1 + +L(Shl6LoopStart): + movaps 10(%ecx), %xmm2 + movaps 26(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 42(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 58(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + test %eax, %eax + palignr $6, %xmm3, %xmm4 + jnz L(Shl6Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave6) +# endif + palignr $6, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $6, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl6LoopStart) + +L(Shl6LoopExit): + movlpd (%ecx), %xmm0 + movl 6(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 6(%edx) + mov $10, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl7): + movaps -7(%ecx), %xmm1 + movaps 9(%ecx), %xmm2 +L(Shl7Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 25(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -9(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -7(%ecx), %xmm1 + +L(Shl7LoopStart): + movaps 9(%ecx), %xmm2 + movaps 25(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 41(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 57(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + test %eax, %eax + palignr $7, %xmm3, %xmm4 + jnz L(Shl7Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave7) +# endif + palignr $7, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $7, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl7LoopStart) + +L(Shl7LoopExit): + movlpd (%ecx), %xmm0 + movl 5(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 5(%edx) + mov $9, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl8): + movaps -8(%ecx), %xmm1 + movaps 8(%ecx), %xmm2 +L(Shl8Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 24(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -8(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -8(%ecx), %xmm1 + +L(Shl8LoopStart): + movaps 8(%ecx), %xmm2 + movaps 24(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %eax, %eax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave8) +# endif + palignr $8, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + mov $8, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl9): + movaps -9(%ecx), %xmm1 + movaps 7(%ecx), %xmm2 +L(Shl9Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 23(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -7(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -9(%ecx), %xmm1 + +L(Shl9LoopStart): + movaps 7(%ecx), %xmm2 + movaps 23(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 39(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 55(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + test %eax, %eax + palignr $9, %xmm3, %xmm4 + jnz L(Shl9Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave9) +# endif + palignr $9, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $9, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl9LoopStart) + +L(Shl9LoopExit): + movlpd -1(%ecx), %xmm0 + movlpd %xmm0, -1(%edx) + mov $7, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl10): + movaps -10(%ecx), %xmm1 + movaps 6(%ecx), %xmm2 +L(Shl10Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 22(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -6(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -10(%ecx), %xmm1 + +L(Shl10LoopStart): + movaps 6(%ecx), %xmm2 + movaps 22(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 38(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 54(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + test %eax, %eax + palignr $10, %xmm3, %xmm4 + jnz L(Shl10Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave10) +# endif + palignr $10, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $10, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl10LoopStart) + +L(Shl10LoopExit): + movlpd -2(%ecx), %xmm0 + movlpd %xmm0, -2(%edx) + mov $6, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl11): + movaps -11(%ecx), %xmm1 + movaps 5(%ecx), %xmm2 +L(Shl11Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 21(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -5(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -11(%ecx), %xmm1 + +L(Shl11LoopStart): + movaps 5(%ecx), %xmm2 + movaps 21(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 37(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 53(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + test %eax, %eax + palignr $11, %xmm3, %xmm4 + jnz L(Shl11Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave11) +# endif + palignr $11, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $11, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl11LoopStart) + +L(Shl11LoopExit): + movlpd -3(%ecx), %xmm0 + movlpd %xmm0, -3(%edx) + mov $5, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl12): + movaps -12(%ecx), %xmm1 + movaps 4(%ecx), %xmm2 +L(Shl12Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 20(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -4(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -12(%ecx), %xmm1 + +L(Shl12LoopStart): + movaps 4(%ecx), %xmm2 + movaps 20(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %eax, %eax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave12) +# endif + palignr $12, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + movl (%ecx), %esi + movl %esi, (%edx) + mov $4, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl13): + movaps -13(%ecx), %xmm1 + movaps 3(%ecx), %xmm2 +L(Shl13Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 19(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -3(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -13(%ecx), %xmm1 + +L(Shl13LoopStart): + movaps 3(%ecx), %xmm2 + movaps 19(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 35(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 51(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + test %eax, %eax + palignr $13, %xmm3, %xmm4 + jnz L(Shl13Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave13) +# endif + palignr $13, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $13, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl13LoopStart) + +L(Shl13LoopExit): + movl -1(%ecx), %esi + movl %esi, -1(%edx) + mov $3, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl14): + movaps -14(%ecx), %xmm1 + movaps 2(%ecx), %xmm2 +L(Shl14Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 18(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -2(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -14(%ecx), %xmm1 + +L(Shl14LoopStart): + movaps 2(%ecx), %xmm2 + movaps 18(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 34(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 50(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + test %eax, %eax + palignr $14, %xmm3, %xmm4 + jnz L(Shl14Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave14) +# endif + palignr $14, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $14, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl14LoopStart) + +L(Shl14LoopExit): + movl -2(%ecx), %esi + movl %esi, -2(%edx) + mov $2, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl15): + movaps -15(%ecx), %xmm1 + movaps 1(%ecx), %xmm2 +L(Shl15Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 17(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -1(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -15(%ecx), %xmm1 + +L(Shl15LoopStart): + movaps 1(%ecx), %xmm2 + movaps 17(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 33(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 49(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + test %eax, %eax + palignr $15, %xmm3, %xmm4 + jnz L(Shl15Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave15) +# endif + palignr $15, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $15, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl15LoopStart) + +L(Shl15LoopExit): + movl -3(%ecx), %esi + movl %esi, -3(%edx) + mov $1, %esi +# ifdef USE_AS_STRCAT + jmp L(CopyFrom1To16Bytes) +# endif + + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(CopyFrom1To16Bytes): +# ifdef USE_AS_STRNCPY + add $16, %ebx +# endif + add %esi, %edx + add %esi, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh8) + +L(CopyFrom1To16BytesLess8): + mov %al, %ah + and $15, %ah + jz L(ExitHigh4) + + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT (3) +# ifdef USE_AS_STRNCPY + sub $4, %ebx + lea 4(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh4): + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + + .p2align 4 +L(Exit8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + SAVE_RESULT (7) +# ifdef USE_AS_STRNCPY + sub $8, %ebx + lea 8(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh8): + mov %ah, %al + and $15, %al + jz L(ExitHigh12) + + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + + .p2align 4 +L(Exit12): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT (11) +# ifdef USE_AS_STRNCPY + sub $12, %ebx + lea 12(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh12): + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + + .p2align 4 +L(Exit16): + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + SAVE_RESULT (15) +# ifdef USE_AS_STRNCPY + sub $16, %ebx + lea 16(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + +# ifdef USE_AS_STRNCPY + + CFI_PUSH(%esi) + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %esi, %ecx + add %esi, %edx + + POP (%esi) + + test %al, %al + jz L(ExitHighCase2) + + cmp $8, %ebx + ja L(CopyFrom1To16BytesLess8) + + test $0x01, %al + jnz L(Exit1) + cmp $1, %ebx + je L(Exit1) + test $0x02, %al + jnz L(Exit2) + cmp $2, %ebx + je L(Exit2) + test $0x04, %al + jnz L(Exit3) + cmp $3, %ebx + je L(Exit3) + test $0x08, %al + jnz L(Exit4) + cmp $4, %ebx + je L(Exit4) + test $0x10, %al + jnz L(Exit5) + cmp $5, %ebx + je L(Exit5) + test $0x20, %al + jnz L(Exit6) + cmp $6, %ebx + je L(Exit6) + test $0x40, %al + jnz L(Exit7) + cmp $7, %ebx + je L(Exit7) + jmp L(Exit8) + + .p2align 4 +L(ExitHighCase2): + cmp $8, %ebx + jbe L(CopyFrom1To16BytesLess8Case3) + + test $0x01, %ah + jnz L(Exit9) + cmp $9, %ebx + je L(Exit9) + test $0x02, %ah + jnz L(Exit10) + cmp $10, %ebx + je L(Exit10) + test $0x04, %ah + jnz L(Exit11) + cmp $11, %ebx + je L(Exit11) + test $0x8, %ah + jnz L(Exit12) + cmp $12, %ebx + je L(Exit12) + test $0x10, %ah + jnz L(Exit13) + cmp $13, %ebx + je L(Exit13) + test $0x20, %ah + jnz L(Exit14) + cmp $14, %ebx + je L(Exit14) + test $0x40, %ah + jnz L(Exit15) + cmp $15, %ebx + je L(Exit15) + jmp L(Exit16) + + CFI_PUSH(%esi) + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %esi, %edx + add %esi, %ecx + + POP (%esi) + + cmp $8, %ebx + ja L(ExitHigh8Case3) + +L(CopyFrom1To16BytesLess8Case3): + cmp $4, %ebx + ja L(ExitHigh4Case3) + + cmp $1, %ebx + je L(Exit1) + cmp $2, %ebx + je L(Exit2) + cmp $3, %ebx + je L(Exit3) + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT (4) + RETURN1 + + .p2align 4 +L(ExitHigh4Case3): + cmp $5, %ebx + je L(Exit5) + cmp $6, %ebx + je L(Exit6) + cmp $7, %ebx + je L(Exit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + SAVE_RESULT (8) + RETURN1 + + .p2align 4 +L(ExitHigh8Case3): + cmp $12, %ebx + ja L(ExitHigh12Case3) + + cmp $9, %ebx + je L(Exit9) + cmp $10, %ebx + je L(Exit10) + cmp $11, %ebx + je L(Exit11) + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT (12) + RETURN1 + + .p2align 4 +L(ExitHigh12Case3): + cmp $13, %ebx + je L(Exit13) + cmp $14, %ebx + je L(Exit14) + cmp $15, %ebx + je L(Exit15) + movlpd (%ecx), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) + SAVE_RESULT (16) + RETURN1 + +# endif + + .p2align 4 +L(Exit1): + movb (%ecx), %al + movb %al, (%edx) + SAVE_RESULT (0) +# ifdef USE_AS_STRNCPY + sub $1, %ebx + lea 1(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit2): + movw (%ecx), %ax + movw %ax, (%edx) + SAVE_RESULT (1) +# ifdef USE_AS_STRNCPY + sub $2, %ebx + lea 2(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) + SAVE_RESULT (2) +# ifdef USE_AS_STRNCPY + sub $3, %ebx + lea 3(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) + SAVE_RESULT (4) +# ifdef USE_AS_STRNCPY + sub $5, %ebx + lea 5(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) + SAVE_RESULT (5) +# ifdef USE_AS_STRNCPY + sub $6, %ebx + lea 6(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) + SAVE_RESULT (6) +# ifdef USE_AS_STRNCPY + sub $7, %ebx + lea 7(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit9): + movlpd (%ecx), %xmm0 + movb 8(%ecx), %al + movlpd %xmm0, (%edx) + movb %al, 8(%edx) + SAVE_RESULT (8) +# ifdef USE_AS_STRNCPY + sub $9, %ebx + lea 9(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit10): + movlpd (%ecx), %xmm0 + movw 8(%ecx), %ax + movlpd %xmm0, (%edx) + movw %ax, 8(%edx) + SAVE_RESULT (9) +# ifdef USE_AS_STRNCPY + sub $10, %ebx + lea 10(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit11): + movlpd (%ecx), %xmm0 + movl 7(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 7(%edx) + SAVE_RESULT (10) +# ifdef USE_AS_STRNCPY + sub $11, %ebx + lea 11(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit13): + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) + SAVE_RESULT (12) +# ifdef USE_AS_STRNCPY + sub $13, %ebx + lea 13(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit14): + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 6(%edx) + SAVE_RESULT (13) +# ifdef USE_AS_STRNCPY + sub $14, %ebx + lea 14(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit15): + movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 7(%edx) + SAVE_RESULT (14) +# ifdef USE_AS_STRNCPY + sub $15, %ebx + lea 15(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + +CFI_POP (%edi) + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(Fill0): + RETURN + + .p2align 4 +L(Fill1): + movb %dl, (%ecx) + RETURN + + .p2align 4 +L(Fill2): + movw %dx, (%ecx) + RETURN + + .p2align 4 +L(Fill3): + movw %dx, (%ecx) + movb %dl, 2(%ecx) + RETURN + + .p2align 4 +L(Fill4): + movl %edx, (%ecx) + RETURN + + .p2align 4 +L(Fill5): + movl %edx, (%ecx) + movb %dl, 4(%ecx) + RETURN + + .p2align 4 +L(Fill6): + movl %edx, (%ecx) + movw %dx, 4(%ecx) + RETURN + + .p2align 4 +L(Fill7): + movl %edx, (%ecx) + movl %edx, 3(%ecx) + RETURN + + .p2align 4 +L(Fill8): + movlpd %xmm0, (%ecx) + RETURN + + .p2align 4 +L(Fill9): + movlpd %xmm0, (%ecx) + movb %dl, 8(%ecx) + RETURN + + .p2align 4 +L(Fill10): + movlpd %xmm0, (%ecx) + movw %dx, 8(%ecx) + RETURN + + .p2align 4 +L(Fill11): + movlpd %xmm0, (%ecx) + movl %edx, 7(%ecx) + RETURN + + .p2align 4 +L(Fill12): + movlpd %xmm0, (%ecx) + movl %edx, 8(%ecx) + RETURN + + .p2align 4 +L(Fill13): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 5(%ecx) + RETURN + + .p2align 4 +L(Fill14): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 6(%ecx) + RETURN + + .p2align 4 +L(Fill15): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 7(%ecx) + RETURN + + .p2align 4 +L(Fill16): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 8(%ecx) + RETURN + + .p2align 4 +L(StrncpyFillExit1): + lea 16(%ebx), %ebx +L(FillFrom1To16Bytes): + test %ebx, %ebx + jz L(Fill0) + cmp $16, %ebx + je L(Fill16) + cmp $8, %ebx + je L(Fill8) + jg L(FillMore8) + cmp $4, %ebx + je L(Fill4) + jg L(FillMore4) + cmp $2, %ebx + jl L(Fill1) + je L(Fill2) + jg L(Fill3) +L(FillMore8): /* but less than 16 */ + cmp $12, %ebx + je L(Fill12) + jl L(FillLess12) + cmp $14, %ebx + jl L(Fill13) + je L(Fill14) + jg L(Fill15) +L(FillMore4): /* but less than 8 */ + cmp $6, %ebx + jl L(Fill5) + je L(Fill6) + jg L(Fill7) +L(FillLess12): /* but more than 8 */ + cmp $10, %ebx + jl L(Fill9) + je L(Fill10) + jmp L(Fill11) + + CFI_PUSH(%edi) + + .p2align 4 +L(StrncpyFillTailWithZero1): + POP (%edi) +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %edx, %edx + sub $16, %ebx + jbe L(StrncpyFillExit1) + + movlpd %xmm0, (%ecx) + movlpd %xmm0, 8(%ecx) + + lea 16(%ecx), %ecx + + mov %ecx, %edx + and $0xf, %edx + sub %edx, %ecx + add %edx, %ebx + xor %edx, %edx + sub $64, %ebx + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%ecx) + movdqa %xmm0, 16(%ecx) + movdqa %xmm0, 32(%ecx) + movdqa %xmm0, 48(%ecx) + lea 64(%ecx), %ecx + sub $64, %ebx + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %ebx + jl L(StrncpyFillLess32) + movdqa %xmm0, (%ecx) + movdqa %xmm0, 16(%ecx) + lea 32(%ecx), %ecx + sub $16, %ebx + jl L(StrncpyFillExit1) + movdqa %xmm0, (%ecx) + lea 16(%ecx), %ecx + jmp L(FillFrom1To16Bytes) + +L(StrncpyFillLess32): + add $16, %ebx + jl L(StrncpyFillExit1) + movdqa %xmm0, (%ecx) + lea 16(%ecx), %ecx + jmp L(FillFrom1To16Bytes) +# endif + + .p2align 4 +L(ExitTail1): + movb (%ecx), %al + movb %al, (%edx) + SAVE_RESULT_TAIL (0) +# ifdef USE_AS_STRNCPY + sub $1, %ebx + lea 1(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail2): + movw (%ecx), %ax + movw %ax, (%edx) + SAVE_RESULT_TAIL (1) +# ifdef USE_AS_STRNCPY + sub $2, %ebx + lea 2(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) + SAVE_RESULT_TAIL (2) +# ifdef USE_AS_STRNCPY + sub $3, %ebx + lea 3(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail4): + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT_TAIL (3) +# ifdef USE_AS_STRNCPY + sub $4, %ebx + lea 4(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) + SAVE_RESULT_TAIL (4) +# ifdef USE_AS_STRNCPY + sub $5, %ebx + lea 5(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) + SAVE_RESULT_TAIL (5) +# ifdef USE_AS_STRNCPY + sub $6, %ebx + lea 6(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) + SAVE_RESULT_TAIL (6) +# ifdef USE_AS_STRNCPY + sub $7, %ebx + lea 7(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + SAVE_RESULT_TAIL (7) +# ifdef USE_AS_STRNCPY + sub $8, %ebx + lea 8(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# endif + RETURN + + .p2align 4 +L(ExitTail9): + movlpd (%ecx), %xmm0 + movb 8(%ecx), %al + movlpd %xmm0, (%edx) + movb %al, 8(%edx) + SAVE_RESULT_TAIL (8) +# ifdef USE_AS_STRNCPY + sub $9, %ebx + lea 9(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail10): + movlpd (%ecx), %xmm0 + movw 8(%ecx), %ax + movlpd %xmm0, (%edx) + movw %ax, 8(%edx) + SAVE_RESULT_TAIL (9) +# ifdef USE_AS_STRNCPY + sub $10, %ebx + lea 10(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail11): + movlpd (%ecx), %xmm0 + movl 7(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 7(%edx) + SAVE_RESULT_TAIL (10) +# ifdef USE_AS_STRNCPY + sub $11, %ebx + lea 11(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail12): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT_TAIL (11) +# ifdef USE_AS_STRNCPY + sub $12, %ebx + lea 12(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail13): + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) + SAVE_RESULT_TAIL (12) +# ifdef USE_AS_STRNCPY + sub $13, %ebx + lea 13(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail14): + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 6(%edx) + SAVE_RESULT_TAIL (13) +# ifdef USE_AS_STRNCPY + sub $14, %ebx + lea 14(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail15): + movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 7(%edx) + SAVE_RESULT_TAIL (14) +# ifdef USE_AS_STRNCPY + sub $15, %ebx + lea 15(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# endif + RETURN + + .p2align 4 +L(ExitTail16): + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + SAVE_RESULT_TAIL (15) +# ifdef USE_AS_STRNCPY + sub $16, %ebx + lea 16(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN +# endif + +# ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + CFI_PUSH (%esi) + CFI_PUSH (%edi) +# endif + .p2align 4 +L(StrncpyLeaveCase2OrCase3): + test %eax, %eax + jnz L(Aligned64LeaveCase2) + +L(Aligned64LeaveCase3): + add $48, %ebx + jle L(CopyFrom1To16BytesCase3) + movaps %xmm4, -64(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm5, -48(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm6, -32(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + jmp L(CopyFrom1To16BytesCase3) + +L(Aligned64LeaveCase2): + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + add $48, %ebx + jle L(CopyFrom1To16BytesCase2OrCase3) + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm4, -64(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm5, -48(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm6, -32(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + jmp L(CopyFrom1To16BytesCase2) + +/*--------------------------------------------------*/ + .p2align 4 +L(StrncpyExit1Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 7(%edx) + mov $15, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit2Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 6(%edx) + mov $14, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit3Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) + mov $13, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit4Case2OrCase3): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) + mov $12, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit5Case2OrCase3): + movlpd (%ecx), %xmm0 + movl 7(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 7(%edx) + mov $11, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit6Case2OrCase3): + movlpd (%ecx), %xmm0 + movl 6(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 6(%edx) + mov $10, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit7Case2OrCase3): + movlpd (%ecx), %xmm0 + movl 5(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 5(%edx) + mov $9, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit8Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + mov $8, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit9Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + mov $7, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit10Case2OrCase3): + movlpd -1(%ecx), %xmm0 + movlpd %xmm0, -1(%edx) + mov $6, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit11Case2OrCase3): + movlpd -2(%ecx), %xmm0 + movlpd %xmm0, -2(%edx) + mov $5, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit12Case2OrCase3): + movl (%ecx), %esi + movl %esi, (%edx) + mov $4, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit13Case2OrCase3): + movl -1(%ecx), %esi + movl %esi, -1(%edx) + mov $3, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit14Case2OrCase3): + movl -2(%ecx), %esi + movl %esi, -2(%edx) + mov $2, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit15Case2OrCase3): + movl -3(%ecx), %esi + movl %esi, -3(%edx) + mov $1, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave1): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit1) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit1) + palignr $1, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit1) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit1) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit1): + lea 15(%edx, %esi), %edx + lea 15(%ecx, %esi), %ecx + movdqu -16(%ecx), %xmm0 + xor %esi, %esi + movdqu %xmm0, -16(%edx) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave2): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit2) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit2) + palignr $2, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit2) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit2) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit2): + lea 14(%edx, %esi), %edx + lea 14(%ecx, %esi), %ecx + movdqu -16(%ecx), %xmm0 + xor %esi, %esi + movdqu %xmm0, -16(%edx) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave3): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit3) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit3) + palignr $3, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit3) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit3) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit3): + lea 13(%edx, %esi), %edx + lea 13(%ecx, %esi), %ecx + movdqu -16(%ecx), %xmm0 + xor %esi, %esi + movdqu %xmm0, -16(%edx) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave4): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit4) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit4) + palignr $4, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit4) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit4) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit4): + lea 12(%edx, %esi), %edx + lea 12(%ecx, %esi), %ecx + movlpd -12(%ecx), %xmm0 + movl -4(%ecx), %eax + movlpd %xmm0, -12(%edx) + movl %eax, -4(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave5): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit5) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit5) + palignr $5, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit5) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit5) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit5): + lea 11(%edx, %esi), %edx + lea 11(%ecx, %esi), %ecx + movlpd -11(%ecx), %xmm0 + movl -4(%ecx), %eax + movlpd %xmm0, -11(%edx) + movl %eax, -4(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave6): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit6) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit6) + palignr $6, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit6) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit6) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit6): + lea 10(%edx, %esi), %edx + lea 10(%ecx, %esi), %ecx + + movlpd -10(%ecx), %xmm0 + movw -2(%ecx), %ax + movlpd %xmm0, -10(%edx) + movw %ax, -2(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave7): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit7) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit7) + palignr $7, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit7) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit7) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit7): + lea 9(%edx, %esi), %edx + lea 9(%ecx, %esi), %ecx + + movlpd -9(%ecx), %xmm0 + movb -1(%ecx), %ah + movlpd %xmm0, -9(%edx) + movb %ah, -1(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave8): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit8) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit8) + palignr $8, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit8) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit8) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit8): + lea 8(%edx, %esi), %edx + lea 8(%ecx, %esi), %ecx + movlpd -8(%ecx), %xmm0 + movlpd %xmm0, -8(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave9): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit9) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit9) + palignr $9, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit9) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit9) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit9): + lea 7(%edx, %esi), %edx + lea 7(%ecx, %esi), %ecx + + movlpd -8(%ecx), %xmm0 + movlpd %xmm0, -8(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave10): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit10) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit10) + palignr $10, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit10) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit10) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit10): + lea 6(%edx, %esi), %edx + lea 6(%ecx, %esi), %ecx + + movlpd -8(%ecx), %xmm0 + movlpd %xmm0, -8(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave11): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit11) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit11) + palignr $11, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit11) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit11) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit11): + lea 5(%edx, %esi), %edx + lea 5(%ecx, %esi), %ecx + movl -5(%ecx), %esi + movb -1(%ecx), %ah + movl %esi, -5(%edx) + movb %ah, -1(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave12): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit12) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit12) + palignr $12, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit12) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit12) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit12): + lea 4(%edx, %esi), %edx + lea 4(%ecx, %esi), %ecx + movl -4(%ecx), %eax + movl %eax, -4(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave13): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit13) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit13) + palignr $13, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit13) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit13) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit13): + lea 3(%edx, %esi), %edx + lea 3(%ecx, %esi), %ecx + + movl -4(%ecx), %eax + movl %eax, -4(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave14): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit14) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit14) + palignr $14, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit14) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit14) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit14): + lea 2(%edx, %esi), %edx + lea 2(%ecx, %esi), %ecx + movw -2(%ecx), %ax + movw %ax, -2(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave15): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit15) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit15) + palignr $15, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit15) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit15) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit15): + lea 1(%edx, %esi), %edx + lea 1(%ecx, %esi), %ecx + movb -1(%ecx), %ah + movb %ah, -1(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) +# endif + +# ifndef USE_AS_STRCAT +# ifdef USE_AS_STRNCPY + CFI_POP (%esi) + CFI_POP (%edi) + + .p2align 4 +L(ExitTail0): + movl %edx, %eax + RETURN + + .p2align 4 +L(StrncpyExit15Bytes): + cmp $12, %ebx + jbe L(StrncpyExit12Bytes) + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + cmpb $0, 11(%ecx) + jz L(ExitTail12) + cmp $13, %ebx + je L(ExitTail13) + cmpb $0, 12(%ecx) + jz L(ExitTail13) + cmp $14, %ebx + je L(ExitTail14) + cmpb $0, 13(%ecx) + jz L(ExitTail14) + movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 7(%edx) +# ifdef USE_AS_STPCPY + lea 14(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax +# else + movl %edx, %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit12Bytes): + cmp $9, %ebx + je L(ExitTail9) + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmp $10, %ebx + je L(ExitTail10) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmp $11, %ebx + je L(ExitTail11) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT_TAIL (11) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit8Bytes): + cmp $4, %ebx + jbe L(StrncpyExit4Bytes) + cmpb $0, (%ecx) + jz L(ExitTail1) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + cmpb $0, 3(%ecx) + jz L(ExitTail4) + + cmp $5, %ebx + je L(ExitTail5) + cmpb $0, 4(%ecx) + jz L(ExitTail5) + cmp $6, %ebx + je L(ExitTail6) + cmpb $0, 5(%ecx) + jz L(ExitTail6) + cmp $7, %ebx + je L(ExitTail7) + cmpb $0, 6(%ecx) + jz L(ExitTail7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) +# ifdef USE_AS_STPCPY + lea 7(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax +# else + movl %edx, %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit4Bytes): + test %ebx, %ebx + jz L(ExitTail0) + cmp $1, %ebx + je L(ExitTail1) + cmpb $0, (%ecx) + jz L(ExitTail1) + cmp $2, %ebx + je L(ExitTail2) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmp $3, %ebx + je L(ExitTail3) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT_TAIL (3) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif + RETURN +# endif + +END (STRCPY) +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S new file mode 100644 index 0000000000..ffbc03c6d5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S @@ -0,0 +1,116 @@ +/* Multiple versions of strcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY) +# ifndef STRCPY +# define STRCPY strcpy +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __stpncpy_ssse3 +# define STRCPY_SSE2 __stpncpy_sse2 +# define STRCPY_IA32 __stpncpy_ia32 +# define __GI_STRCPY __GI_stpncpy +# define __GI___STRCPY __GI___stpncpy +# else +# define STRCPY_SSSE3 __stpcpy_ssse3 +# define STRCPY_SSE2 __stpcpy_sse2 +# define STRCPY_IA32 __stpcpy_ia32 +# define __GI_STRCPY __GI_stpcpy +# define __GI___STRCPY __GI___stpcpy +# endif +#else +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __strncpy_ssse3 +# define STRCPY_SSE2 __strncpy_sse2 +# define STRCPY_IA32 __strncpy_ia32 +# define __GI_STRCPY __GI_strncpy +# else +# define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_SSE2 __strcpy_sse2 +# define STRCPY_IA32 __strcpy_ia32 +# define __GI_STRCPY __GI_strcpy +# endif +#endif + + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncpy in static library since we + need strncpy before the initialization happened. */ +#if IS_IN (libc) + + .text +ENTRY(STRCPY) + .type STRCPY, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (STRCPY_IA32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (STRCPY_SSE2) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (STRCPY_SSSE3) +2: ret +END(STRCPY) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCPY_IA32, @function; \ + .align 16; \ + .globl STRCPY_IA32; \ + .hidden STRCPY_IA32; \ + STRCPY_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCPY_IA32, .-STRCPY_IA32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcpy calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCPY; __GI_STRCPY = STRCPY_IA32 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCPY; __GI___STRCPY = STRCPY_IA32 + +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# include "../../stpncpy.S" +# else +# include "../../i586/stpcpy.S" +# endif +#else +# ifndef USE_AS_STRNCPY +# include "../../i586/strcpy.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c new file mode 100644 index 0000000000..6d61e190a8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c @@ -0,0 +1,2 @@ +#define __strcspn_sse2 __strcspn_ia32 +#include <sysdeps/x86_64/multiarch/strcspn-c.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S new file mode 100644 index 0000000000..21e5093924 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S @@ -0,0 +1,75 @@ +/* Multiple versions of strcspn + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <init-arch.h> + +#ifdef USE_AS_STRPBRK +#define STRCSPN_SSE42 __strpbrk_sse42 +#define STRCSPN_IA32 __strpbrk_ia32 +#define __GI_STRCSPN __GI_strpbrk +#else +#ifndef STRCSPN +#define STRCSPN strcspn +#define STRCSPN_SSE42 __strcspn_sse42 +#define STRCSPN_IA32 __strcspn_ia32 +#define __GI_STRCSPN __GI_strcspn +#endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strpbrk in static library since we + need strpbrk before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc) + .text +ENTRY(STRCSPN) + .type STRCSPN, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (STRCSPN_IA32) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + LOAD_FUNC_GOT_EAX (STRCSPN_SSE42) +2: ret +END(STRCSPN) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCSPN_IA32, @function; \ + .globl STRCSPN_IA32; \ + .p2align 4; \ + STRCSPN_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCSPN_IA32, .-STRCSPN_IA32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_IA32 +#endif + +#ifdef USE_AS_STRPBRK +#include "../../strpbrk.S" +#else +#include "../../strcspn.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S new file mode 100644 index 0000000000..d3ea864bab --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S @@ -0,0 +1,125 @@ +/* strlen with SSE2 and BSF + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if defined SHARED && IS_IN (libc) + +#include <sysdep.h> + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) +#define PARMS 4 + 8 /* Preserve ESI and EDI. */ +#define STR PARMS +#define ENTRANCE PUSH (%esi); PUSH (%edi); cfi_remember_state +#define RETURN POP (%edi); POP (%esi); ret; \ + cfi_restore_state; cfi_remember_state + + .text +ENTRY ( __strlen_sse2_bsf) + ENTRANCE + mov STR(%esp), %edi + xor %eax, %eax + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%edi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit_less16) + mov %edi, %eax + and $-16, %eax + jmp L(align16_start) +L(next): + + mov %edi, %eax + and $-16, %eax + pcmpeqb (%eax), %xmm0 + mov $-1, %esi + sub %eax, %ecx + shl %cl, %esi + pmovmskb %xmm0, %edx + and %esi, %edx + jnz L(exit) +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + .p2align 4 +L(align16_loop): + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%eax), %eax + test %edx, %edx + jz L(align16_loop) +L(exit): + sub %edi, %eax +L(exit_less16): + bsf %edx, %edx + add %edx, %eax + RETURN +L(exit16): + sub %edi, %eax + bsf %edx, %edx + add %edx, %eax + add $16, %eax + RETURN +L(exit32): + sub %edi, %eax + bsf %edx, %edx + add %edx, %eax + add $32, %eax + RETURN +L(exit48): + sub %edi, %eax + bsf %edx, %edx + add %edx, %eax + add $48, %eax + POP (%edi) + POP (%esi) + ret + +END ( __strlen_sse2_bsf) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S new file mode 100644 index 0000000000..36fc1469d0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S @@ -0,0 +1,695 @@ +/* strlen with SSE2 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* for strlen only SHARED version is optimized, for strcat, strncat, strnlen both STATIC and SHARED are optimized */ + +#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc) + +# ifndef USE_AS_STRCAT + +# include <sysdep.h> +# define PARMS 4 +# define STR PARMS +# define RETURN ret + +# ifdef USE_AS_STRNLEN +# define LEN PARMS + 8 +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) +# undef RETURN +# define RETURN POP (%edi); CFI_PUSH(%edi); ret +# endif + +# ifndef STRLEN +# define STRLEN __strlen_sse2 +# endif + + atom_text_section +ENTRY (STRLEN) + mov STR(%esp), %edx +# ifdef USE_AS_STRNLEN + PUSH (%edi) + movl LEN(%esp), %edi + sub $4, %edi + jbe L(len_less4_prolog) +# endif +# endif + xor %eax, %eax + cmpb $0, (%edx) + jz L(exit_tail0) + cmpb $0, 1(%edx) + jz L(exit_tail1) + cmpb $0, 2(%edx) + jz L(exit_tail2) + cmpb $0, 3(%edx) + jz L(exit_tail3) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less8_prolog) +# endif + + cmpb $0, 4(%edx) + jz L(exit_tail4) + cmpb $0, 5(%edx) + jz L(exit_tail5) + cmpb $0, 6(%edx) + jz L(exit_tail6) + cmpb $0, 7(%edx) + jz L(exit_tail7) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less12_prolog) +# endif + + cmpb $0, 8(%edx) + jz L(exit_tail8) + cmpb $0, 9(%edx) + jz L(exit_tail9) + cmpb $0, 10(%edx) + jz L(exit_tail10) + cmpb $0, 11(%edx) + jz L(exit_tail11) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less16_prolog) +# endif + + cmpb $0, 12(%edx) + jz L(exit_tail12) + cmpb $0, 13(%edx) + jz L(exit_tail13) + cmpb $0, 14(%edx) + jz L(exit_tail14) + cmpb $0, 15(%edx) + jz L(exit_tail15) + + pxor %xmm0, %xmm0 + lea 16(%edx), %eax + mov %eax, %ecx + and $-16, %eax + +# ifdef USE_AS_STRNLEN + and $15, %edx + add %edx, %edi + sub $64, %edi + jbe L(len_less64) +# endif + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + mov %eax, %edx + and $63, %edx + add %edx, %edi +# endif + + and $-0x40, %eax + + .p2align 4 +L(aligned_64_loop): +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + movaps (%eax), %xmm0 + movaps 16(%eax), %xmm1 + movaps 32(%eax), %xmm2 + movaps 48(%eax), %xmm6 + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqb %xmm3, %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 64(%eax), %eax + jz L(aligned_64_loop) + + pcmpeqb -64(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 48(%ecx), %ecx + jnz L(exit) + + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqb -32(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqb %xmm6, %xmm3 + pmovmskb %xmm3, %edx + lea -16(%ecx), %ecx +L(exit): + sub %ecx, %eax + test %dl, %dl + jz L(exit_high) + + mov %dl, %cl + and $15, %cl + jz L(exit_8) + test $0x01, %dl + jnz L(exit_tail0) + test $0x02, %dl + jnz L(exit_tail1) + test $0x04, %dl + jnz L(exit_tail2) + add $3, %eax + RETURN + + .p2align 4 +L(exit_8): + test $0x10, %dl + jnz L(exit_tail4) + test $0x20, %dl + jnz L(exit_tail5) + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax + RETURN + + .p2align 4 +L(exit_high): + mov %dh, %ch + and $15, %ch + jz L(exit_high_8) + test $0x01, %dh + jnz L(exit_tail8) + test $0x02, %dh + jnz L(exit_tail9) + test $0x04, %dh + jnz L(exit_tail10) + add $11, %eax + RETURN + + .p2align 4 +L(exit_high_8): + test $0x10, %dh + jnz L(exit_tail12) + test $0x20, %dh + jnz L(exit_tail13) + test $0x40, %dh + jnz L(exit_tail14) + add $15, %eax +L(exit_tail0): + RETURN + +# ifdef USE_AS_STRNLEN + + .p2align 4 +L(len_less64): + pxor %xmm0, %xmm0 + add $64, %edi + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + movl LEN(%esp), %eax + RETURN + + .p2align 4 +L(strnlen_exit): + sub %ecx, %eax + + test %dl, %dl + jz L(strnlen_exit_high) + mov %dl, %cl + and $15, %cl + jz L(strnlen_exit_8) + test $0x01, %dl + jnz L(exit_tail0) + test $0x02, %dl + jnz L(strnlen_exit_tail1) + test $0x04, %dl + jnz L(strnlen_exit_tail2) + sub $4, %edi + jb L(return_start_len) + lea 3(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_8): + test $0x10, %dl + jnz L(strnlen_exit_tail4) + test $0x20, %dl + jnz L(strnlen_exit_tail5) + test $0x40, %dl + jnz L(strnlen_exit_tail6) + sub $8, %edi + jb L(return_start_len) + lea 7(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_high): + mov %dh, %ch + and $15, %ch + jz L(strnlen_exit_high_8) + test $0x01, %dh + jnz L(strnlen_exit_tail8) + test $0x02, %dh + jnz L(strnlen_exit_tail9) + test $0x04, %dh + jnz L(strnlen_exit_tail10) + sub $12, %edi + jb L(return_start_len) + lea 11(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_high_8): + test $0x10, %dh + jnz L(strnlen_exit_tail12) + test $0x20, %dh + jnz L(strnlen_exit_tail13) + test $0x40, %dh + jnz L(strnlen_exit_tail14) + sub $16, %edi + jb L(return_start_len) + lea 15(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail1): + sub $2, %edi + jb L(return_start_len) + lea 1(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail2): + sub $3, %edi + jb L(return_start_len) + lea 2(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail4): + sub $5, %edi + jb L(return_start_len) + lea 4(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail5): + sub $6, %edi + jb L(return_start_len) + lea 5(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail6): + sub $7, %edi + jb L(return_start_len) + lea 6(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail8): + sub $9, %edi + jb L(return_start_len) + lea 8(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail9): + sub $10, %edi + jb L(return_start_len) + lea 9(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail10): + sub $11, %edi + jb L(return_start_len) + lea 10(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail12): + sub $13, %edi + jb L(return_start_len) + lea 12(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail13): + sub $14, %edi + jb L(return_start_len) + lea 13(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail14): + sub $15, %edi + jb L(return_start_len) + lea 14(%eax), %eax + RETURN + + .p2align 4 +L(return_start_len): + movl LEN(%esp), %eax + RETURN + +/* for prolog only */ + + .p2align 4 +L(len_less4_prolog): + xor %eax, %eax + + add $4, %edi + jz L(exit_tail0) + + cmpb $0, (%edx) + jz L(exit_tail0) + cmp $1, %edi + je L(exit_tail1) + + cmpb $0, 1(%edx) + jz L(exit_tail1) + cmp $2, %edi + je L(exit_tail2) + + cmpb $0, 2(%edx) + jz L(exit_tail2) + cmp $3, %edi + je L(exit_tail3) + + cmpb $0, 3(%edx) + jz L(exit_tail3) + mov $4, %eax + RETURN + + .p2align 4 +L(len_less8_prolog): + add $4, %edi + + cmpb $0, 4(%edx) + jz L(exit_tail4) + cmp $1, %edi + je L(exit_tail5) + + cmpb $0, 5(%edx) + jz L(exit_tail5) + cmp $2, %edi + je L(exit_tail6) + + cmpb $0, 6(%edx) + jz L(exit_tail6) + cmp $3, %edi + je L(exit_tail7) + + cmpb $0, 7(%edx) + jz L(exit_tail7) + mov $8, %eax + RETURN + + + .p2align 4 +L(len_less12_prolog): + add $4, %edi + + cmpb $0, 8(%edx) + jz L(exit_tail8) + cmp $1, %edi + je L(exit_tail9) + + cmpb $0, 9(%edx) + jz L(exit_tail9) + cmp $2, %edi + je L(exit_tail10) + + cmpb $0, 10(%edx) + jz L(exit_tail10) + cmp $3, %edi + je L(exit_tail11) + + cmpb $0, 11(%edx) + jz L(exit_tail11) + mov $12, %eax + RETURN + + .p2align 4 +L(len_less16_prolog): + add $4, %edi + + cmpb $0, 12(%edx) + jz L(exit_tail12) + cmp $1, %edi + je L(exit_tail13) + + cmpb $0, 13(%edx) + jz L(exit_tail13) + cmp $2, %edi + je L(exit_tail14) + + cmpb $0, 14(%edx) + jz L(exit_tail14) + cmp $3, %edi + je L(exit_tail15) + + cmpb $0, 15(%edx) + jz L(exit_tail15) + mov $16, %eax + RETURN +# endif + + .p2align 4 +L(exit_tail1): + add $1, %eax + RETURN + +L(exit_tail2): + add $2, %eax + RETURN + +L(exit_tail3): + add $3, %eax + RETURN + +L(exit_tail4): + add $4, %eax + RETURN + +L(exit_tail5): + add $5, %eax + RETURN + +L(exit_tail6): + add $6, %eax + RETURN + +L(exit_tail7): + add $7, %eax + RETURN + +L(exit_tail8): + add $8, %eax + RETURN + +L(exit_tail9): + add $9, %eax + RETURN + +L(exit_tail10): + add $10, %eax + RETURN + +L(exit_tail11): + add $11, %eax + RETURN + +L(exit_tail12): + add $12, %eax + RETURN + +L(exit_tail13): + add $13, %eax + RETURN + +L(exit_tail14): + add $14, %eax + RETURN + +L(exit_tail15): + add $15, %eax +# ifndef USE_AS_STRCAT + RETURN +END (STRLEN) +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S new file mode 100644 index 0000000000..77cf6bcdb0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S @@ -0,0 +1,60 @@ +/* Multiple versions of strlen + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc and for the + DSO. In static binaries, we need strlen before the initialization + happened. */ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(strlen) + .type strlen, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strlen_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__strlen_sse2_bsf) + HAS_ARCH_FEATURE (Slow_BSF) + jz 2f + LOAD_FUNC_GOT_EAX (__strlen_sse2) +2: ret +END(strlen) + +# undef ENTRY +# define ENTRY(name) \ + .type __strlen_ia32, @function; \ + .globl __strlen_ia32; \ + .p2align 4; \ + __strlen_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strlen_ia32, .-__strlen_ia32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strlen; __GI_strlen = __strlen_ia32 +#endif + +#include "../../i586/strlen.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c new file mode 100644 index 0000000000..76581eb62b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c @@ -0,0 +1,8 @@ +#include <string.h> + +extern __typeof (strncasecmp) __strncasecmp_nonascii; + +#define __strncasecmp __strncasecmp_nonascii +#include <string/strncase.c> + +strong_alias (__strncasecmp_nonascii, __strncasecmp_ia32) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S new file mode 100644 index 0000000000..a56e63a566 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S @@ -0,0 +1,39 @@ +/* Entry point for multi-version x86 strncasecmp. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY(__strncasecmp) + .type __strncasecmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strncasecmp_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__strncasecmp_ssse3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + LOAD_FUNC_GOT_EAX (__strncasecmp_sse4_2) +2: ret +END(__strncasecmp) + +weak_alias (__strncasecmp, strncasecmp) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c new file mode 100644 index 0000000000..7e601af271 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c @@ -0,0 +1,13 @@ +#include <string.h> + +extern __typeof (strncasecmp_l) __strncasecmp_l_nonascii; + +#define __strncasecmp_l __strncasecmp_l_nonascii +#define USE_IN_EXTENDED_LOCALE_MODEL 1 +#include <string/strncase.c> + +strong_alias (__strncasecmp_l_nonascii, __strncasecmp_l_ia32) + +/* The needs of strcasecmp in libc are minimal, no need to go through + the IFUNC. */ +strong_alias (__strncasecmp_l_nonascii, __GI___strncasecmp_l) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S new file mode 100644 index 0000000000..557210832e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S @@ -0,0 +1,2 @@ +#define USE_AS_STRNCASECMP_L 1 +#include "strcmp-sse4.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S new file mode 100644 index 0000000000..d438a1ae35 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S @@ -0,0 +1,2 @@ +#define USE_AS_STRNCASECMP_L 1 +#include "strcmp-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S new file mode 100644 index 0000000000..8a74ee8574 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S @@ -0,0 +1,7 @@ +/* Multiple versions of strncasecmp_l + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP __strncasecmp_l +#define USE_AS_STRNCASECMP_L +#include "strcmp.S" + +weak_alias (__strncasecmp_l, strncasecmp_l) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c new file mode 100644 index 0000000000..132a000545 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c @@ -0,0 +1,8 @@ +#define STRNCAT __strncat_ia32 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32); +#endif + +#include "string/strncat.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S new file mode 100644 index 0000000000..f1045b72b8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S @@ -0,0 +1,4 @@ +#define STRCAT __strncat_sse2 +#define USE_AS_STRNCAT + +#include "strcat-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S new file mode 100644 index 0000000000..625b90a978 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S @@ -0,0 +1,4 @@ +#define STRCAT __strncat_ssse3 +#define USE_AS_STRNCAT + +#include "strcat-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S new file mode 100644 index 0000000000..5c1bf41453 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncat + All versions must be listed in ifunc-impl-list.c. */ +#define STRCAT strncat +#define USE_AS_STRNCAT +#include "strcat.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c new file mode 100644 index 0000000000..cc059da494 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c @@ -0,0 +1,8 @@ +#ifdef SHARED +# define STRNCMP __strncmp_ia32 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncmp_ia32, __GI_strncmp, __strncmp_ia32); +#endif + +#include "string/strncmp.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S new file mode 100644 index 0000000000..cf14dfaf6c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S @@ -0,0 +1,5 @@ +#ifdef SHARED +# define USE_AS_STRNCMP +# define STRCMP __strncmp_sse4_2 +# include "strcmp-sse4.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S new file mode 100644 index 0000000000..536c8685f2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S @@ -0,0 +1,5 @@ +#ifdef SHARED +# define USE_AS_STRNCMP +# define STRCMP __strncmp_ssse3 +# include "strcmp-ssse3.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S new file mode 100644 index 0000000000..150d4786d2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncmp + All versions must be listed in ifunc-impl-list.c. */ +#define USE_AS_STRNCMP +#define STRCMP strncmp +#include "strcmp.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c new file mode 100644 index 0000000000..201e3f98b3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c @@ -0,0 +1,8 @@ +#define STRNCPY __strncpy_ia32 +#ifdef SHARED +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncpy_ia32, __GI_strncpy, __strncpy_ia32); +#endif + +#include "string/strncpy.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S new file mode 100644 index 0000000000..bdd99239a4 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_sse2 +#include "strcpy-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S new file mode 100644 index 0000000000..bf82ee447d --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S new file mode 100644 index 0000000000..9c257efc6e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncpy + All versions must be listed in ifunc-impl-list.c. */ +#define USE_AS_STRNCPY +#define STRCPY strncpy +#include "strcpy.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c new file mode 100644 index 0000000000..351e939a93 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c @@ -0,0 +1,10 @@ +#define STRNLEN __strnlen_ia32 +#ifdef SHARED +# undef libc_hidden_def +# define libc_hidden_def(name) \ + __hidden_ver1 (__strnlen_ia32, __GI_strnlen, __strnlen_ia32); \ + strong_alias (__strnlen_ia32, __strnlen_ia32_1); \ + __hidden_ver1 (__strnlen_ia32_1, __GI___strnlen, __strnlen_ia32_1); +#endif + +#include "string/strnlen.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S new file mode 100644 index 0000000000..56b6ae2a5c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNLEN +#define STRLEN __strnlen_sse2 +#include "strlen-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S new file mode 100644 index 0000000000..d241522c70 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S @@ -0,0 +1,37 @@ +/* Multiple versions of strnlen + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__strnlen) + .type __strnlen, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strnlen_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__strnlen_sse2) +2: ret +END(__strnlen) + +weak_alias(__strnlen, strnlen) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c new file mode 100644 index 0000000000..5db62053b3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c @@ -0,0 +1,2 @@ +#define __strpbrk_sse2 __strpbrk_ia32 +#include <sysdeps/x86_64/multiarch/strpbrk-c.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S new file mode 100644 index 0000000000..7201d6376f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S @@ -0,0 +1,5 @@ +/* Multiple versions of strpbrk + All versions must be listed in ifunc-impl-list.c. */ +#define STRCSPN strpbrk +#define USE_AS_STRPBRK +#include "strcspn.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S new file mode 100644 index 0000000000..39a7c8825b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S @@ -0,0 +1,282 @@ +/* strrchr with SSE2 with bsf and bsr + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + + .text +ENTRY (__strrchr_sse2_bsf) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + PUSH (%edi) + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + /* ECX has OFFSET. */ + and $63, %ecx + cmp $48, %ecx + pshufd $0, %xmm1, %xmm1 + ja L(crosscashe) + +/* unaligned string. */ + movdqu (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm2, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + + test %eax, %eax + jnz L(unaligned_match1) + + test %edx, %edx + jnz L(return_null) + + and $-16, %edi + add $16, %edi + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_return_value1): + bsf %edx, %ecx + mov $2, %edx + shl %cl, %edx + sub $1, %edx + and %edx, %eax + jz L(return_null) + bsr %eax, %eax + add %edi, %eax + POP (%edi) + ret + CFI_PUSH (%edi) + + .p2align 4 +L(unaligned_match1): + test %edx, %edx + jnz L(unaligned_return_value1) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + lea 16(%edi), %esi + and $-16, %edi + add $16, %edi + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 + L(crosscashe): +/* Hancle unaligned string. */ + and $15, %ecx + and $-16, %edi + pxor %xmm3, %xmm3 + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm3, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + shr %cl, %edx + shr %cl, %eax + + test %eax, %eax + jnz L(unaligned_match) + + test %edx, %edx + jnz L(return_null) + + add $16, %edi + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_return_value): + add %ecx, %edi + bsf %edx, %ecx + mov $2, %edx + shl %cl, %edx + sub $1, %edx + and %edx, %eax + jz L(return_null) + bsr %eax, %eax + add %edi, %eax + POP (%edi) + ret + CFI_PUSH (%edi) + + .p2align 4 +L(unaligned_match): + test %edx, %edx + jnz L(unaligned_return_value) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + add $16, %edi + lea (%edi, %ecx), %esi + +/* Loop start on aligned string. */ + .p2align 4 +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jz L(loop) + +L(matches): + test %eax, %eax + jnz L(match) +L(return_value): + test %ebx, %ebx + jz L(return_null_1) + bsr %ebx, %eax + add %esi, %eax + + POP (%ebx) + POP (%esi) + + sub $16, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(match): + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(return_value_1) + mov %eax, %ebx + mov %edi, %esi + jmp L(loop) + + .p2align 4 +L(return_value_1): + bsf %ecx, %ecx + mov $2, %edx + shl %cl, %edx + sub $1, %edx + and %edx, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + + bsr %eax, %eax + add %edi, %eax + sub $16, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) +/* Return NULL. */ + .p2align 4 +L(return_null): + xor %eax, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + CFI_PUSH (%ebx) + CFI_PUSH (%esi) +/* Return NULL. */ + .p2align 4 +L(return_null_1): + POP (%ebx) + POP (%esi) + POP (%edi) + xor %eax, %eax + ret + +END (__strrchr_sse2_bsf) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S new file mode 100644 index 0000000000..20934288be --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S @@ -0,0 +1,708 @@ +/* strrchr SSE2 without bsf and bsr + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 +# define ENTRANCE PUSH(%edi); +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); + +# define STR1 PARMS +# define STR2 STR1+4 + + atom_text_section +ENTRY (__strrchr_sse2) + + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + /* ECX has OFFSET. */ + and $63, %ecx + cmp $48, %ecx + pshufd $0, %xmm1, %xmm1 + ja L(crosscache) + +/* unaligned string. */ + movdqu (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm2, %ecx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + add $16, %edi + + test %eax, %eax + jnz L(unaligned_match1) + + test %ecx, %ecx + jnz L(return_null) + + and $-16, %edi + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_match1): + test %ecx, %ecx + jnz L(prolog_find_zero_1) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + mov %edi, %esi + and $-16, %edi + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(crosscache): +/* Hancle unaligned string. */ + and $15, %ecx + and $-16, %edi + pxor %xmm3, %xmm3 + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm3, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + shr %cl, %edx + shr %cl, %eax + add $16, %edi + + test %eax, %eax + jnz L(unaligned_match) + + test %edx, %edx + jnz L(return_null) + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_match): + test %edx, %edx + jnz L(prolog_find_zero) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + lea (%edi, %ecx), %esi + +/* Loop start on aligned string. */ + .p2align 4 +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jz L(loop) + +L(matches): + test %eax, %eax + jnz L(match) +L(return_value): + test %ebx, %ebx + jz L(return_null_1) + mov %ebx, %eax + mov %esi, %edi + + POP (%ebx) + POP (%esi) + + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(return_null_1): + POP (%ebx) + POP (%esi) + + xor %eax, %eax + RETURN + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(match): + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(find_zero) + mov %eax, %ebx + mov %edi, %esi + jmp L(loop) + + .p2align 4 +L(find_zero): + test %cl, %cl + jz L(find_zero_high) + mov %cl, %dl + and $15, %dl + jz L(find_zero_8) + test $0x01, %cl + jnz L(FindZeroExit1) + test $0x02, %cl + jnz L(FindZeroExit2) + test $0x04, %cl + jnz L(FindZeroExit3) + and $1 << 4 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_8): + test $0x10, %cl + jnz L(FindZeroExit5) + test $0x20, %cl + jnz L(FindZeroExit6) + test $0x40, %cl + jnz L(FindZeroExit7) + and $1 << 8 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_high): + mov %ch, %dh + and $15, %dh + jz L(find_zero_high_8) + test $0x01, %ch + jnz L(FindZeroExit9) + test $0x02, %ch + jnz L(FindZeroExit10) + test $0x04, %ch + jnz L(FindZeroExit11) + and $1 << 12 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_high_8): + test $0x10, %ch + jnz L(FindZeroExit13) + test $0x20, %ch + jnz L(FindZeroExit14) + test $0x40, %ch + jnz L(FindZeroExit15) + and $1 << 16 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit1): + and $1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit2): + and $1 << 2 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit3): + and $1 << 3 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit5): + and $1 << 5 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit6): + and $1 << 6 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit7): + and $1 << 7 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit9): + and $1 << 9 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit10): + and $1 << 10 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit11): + and $1 << 11 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit13): + and $1 << 13 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit14): + and $1 << 14 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit15): + and $1 << 15 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + + .p2align 4 +L(match_exit): + test %ah, %ah + jnz L(match_exit_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(match_exit_8) + test $0x08, %al + jnz L(Exit4) + test $0x04, %al + jnz L(Exit3) + test $0x02, %al + jnz L(Exit2) + lea -16(%edi), %eax + RETURN + + .p2align 4 +L(match_exit_8): + test $0x80, %al + jnz L(Exit8) + test $0x40, %al + jnz L(Exit7) + test $0x20, %al + jnz L(Exit6) + lea -12(%edi), %eax + RETURN + + .p2align 4 +L(match_exit_high): + mov %ah, %dh + and $15 << 4, %dh + jnz L(match_exit_high_8) + test $0x08, %ah + jnz L(Exit12) + test $0x04, %ah + jnz L(Exit11) + test $0x02, %ah + jnz L(Exit10) + lea -8(%edi), %eax + RETURN + + .p2align 4 +L(match_exit_high_8): + test $0x80, %ah + jnz L(Exit16) + test $0x40, %ah + jnz L(Exit15) + test $0x20, %ah + jnz L(Exit14) + lea -4(%edi), %eax + RETURN + + .p2align 4 +L(Exit2): + lea -15(%edi), %eax + RETURN + + .p2align 4 +L(Exit3): + lea -14(%edi), %eax + RETURN + + .p2align 4 +L(Exit4): + lea -13(%edi), %eax + RETURN + + .p2align 4 +L(Exit6): + lea -11(%edi), %eax + RETURN + + .p2align 4 +L(Exit7): + lea -10(%edi), %eax + RETURN + + .p2align 4 +L(Exit8): + lea -9(%edi), %eax + RETURN + + .p2align 4 +L(Exit10): + lea -7(%edi), %eax + RETURN + + .p2align 4 +L(Exit11): + lea -6(%edi), %eax + RETURN + + .p2align 4 +L(Exit12): + lea -5(%edi), %eax + RETURN + + .p2align 4 +L(Exit14): + lea -3(%edi), %eax + RETURN + + .p2align 4 +L(Exit15): + lea -2(%edi), %eax + RETURN + + .p2align 4 +L(Exit16): + lea -1(%edi), %eax + RETURN + +/* Return NULL. */ + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero): + add %ecx, %edi + mov %edx, %ecx +L(prolog_find_zero_1): + test %cl, %cl + jz L(prolog_find_zero_high) + mov %cl, %dl + and $15, %dl + jz L(prolog_find_zero_8) + test $0x01, %cl + jnz L(PrologFindZeroExit1) + test $0x02, %cl + jnz L(PrologFindZeroExit2) + test $0x04, %cl + jnz L(PrologFindZeroExit3) + and $1 << 4 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero_8): + test $0x10, %cl + jnz L(PrologFindZeroExit5) + test $0x20, %cl + jnz L(PrologFindZeroExit6) + test $0x40, %cl + jnz L(PrologFindZeroExit7) + and $1 << 8 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero_high): + mov %ch, %dh + and $15, %dh + jz L(prolog_find_zero_high_8) + test $0x01, %ch + jnz L(PrologFindZeroExit9) + test $0x02, %ch + jnz L(PrologFindZeroExit10) + test $0x04, %ch + jnz L(PrologFindZeroExit11) + and $1 << 12 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero_high_8): + test $0x10, %ch + jnz L(PrologFindZeroExit13) + test $0x20, %ch + jnz L(PrologFindZeroExit14) + test $0x40, %ch + jnz L(PrologFindZeroExit15) + and $1 << 16 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit1): + and $1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit2): + and $1 << 2 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit3): + and $1 << 3 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit5): + and $1 << 5 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit6): + and $1 << 6 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit7): + and $1 << 7 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit9): + and $1 << 9 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit10): + and $1 << 10 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit11): + and $1 << 11 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit13): + and $1 << 13 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit14): + and $1 << 14 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit15): + and $1 << 15 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + +END (__strrchr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S new file mode 100644 index 0000000000..d9281eaeae --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S @@ -0,0 +1,57 @@ +/* Multiple versions of strrchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(strrchr) + .type strrchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strrchr_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__strrchr_sse2_bsf) + HAS_ARCH_FEATURE (Slow_BSF) + jz 2f + LOAD_FUNC_GOT_EAX (__strrchr_sse2) +2: ret +END(strrchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __strrchr_ia32, @function; \ + .globl __strrchr_ia32; \ + .p2align 4; \ + __strrchr_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strrchr_ia32, .-__strrchr_ia32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strrchr; __GI_strrchr = __strrchr_ia32 +#endif + +#include "../../strrchr.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c new file mode 100644 index 0000000000..bea09dea71 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c @@ -0,0 +1,2 @@ +#define __strspn_sse2 __strspn_ia32 +#include <sysdeps/x86_64/multiarch/strspn-c.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S new file mode 100644 index 0000000000..1269062381 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S @@ -0,0 +1,56 @@ +/* Multiple versions of strspn + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(strspn) + .type strspn, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strspn_ia32) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + LOAD_FUNC_GOT_EAX (__strspn_sse42) +2: ret +END(strspn) + +# undef ENTRY +# define ENTRY(name) \ + .type __strspn_ia32, @function; \ + .globl __strspn_ia32; \ + .p2align 4; \ +__strspn_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strspn_ia32, .-__strspn_ia32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strspn; __GI_strspn = __strspn_ia32 +#endif + +#include "../../strspn.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c new file mode 100644 index 0000000000..593cfec273 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/test-multiarch.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c new file mode 100644 index 0000000000..7760b966e2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/varshift.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h new file mode 100644 index 0000000000..7c72c70d67 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/varshift.h> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c new file mode 100644 index 0000000000..38d41d04de --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c @@ -0,0 +1,22 @@ +#include <wchar.h> + +#if IS_IN (libc) +# undef libc_hidden_weak +# define libc_hidden_weak(name) + +# undef weak_alias +# define weak_alias(name,alias) + +# ifdef SHARED +# undef libc_hidden_def +# define libc_hidden_def(name) \ + __hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32); \ + strong_alias (__wcschr_ia32, __wcschr_ia32_1); \ + __hidden_ver1 (__wcschr_ia32_1, __GI___wcschr, __wcschr_ia32_1); +# endif +#endif + +extern __typeof (wcschr) __wcschr_ia32; + +#define WCSCHR __wcschr_ia32 +#include <wcsmbs/wcschr.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S new file mode 100644 index 0000000000..9ff6c3b8d6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S @@ -0,0 +1,219 @@ +/* wcschr with SSE2, without using bsf instructions + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + + atom_text_section +ENTRY (__wcschr_sse2) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + mov %ecx, %eax + punpckldq %xmm1, %xmm1 + pxor %xmm2, %xmm2 + punpckldq %xmm1, %xmm1 + + and $63, %eax + cmp $48, %eax + ja L(cross_cache) + + movdqu (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + and $-16, %ecx + jmp L(loop) + + .p2align 4 +L(cross_cache): + PUSH (%edi) + mov %ecx, %edi + mov %eax, %ecx + and $-16, %edi + and $15, %ecx + movdqa (%edi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + + sarl %cl, %edx + sarl %cl, %eax + test %eax, %eax + jz L(unaligned_no_match) + + add %edi, %ecx + POP (%edi) + + test %edx, %edx + jz L(match_case1) + test %al, %al + jz L(match_higth_case2) + test $15, %al + jnz L(match_case2_4) + test $15, %dl + jnz L(return_null) + lea 4(%ecx), %eax + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(unaligned_no_match): + mov %edi, %ecx + POP (%edi) + + test %edx, %edx + jnz L(return_null) + + pxor %xmm2, %xmm2 + +/* Loop start on aligned string. */ + .p2align 4 +L(loop): + add $16, %ecx + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jz L(loop) + + .p2align 4 +L(matches): + pmovmskb %xmm2, %edx + test %eax, %eax + jz L(return_null) + test %edx, %edx + jz L(match_case1) + + .p2align 4 +L(match_case2): + test %al, %al + jz L(match_higth_case2) + test $15, %al + jnz L(match_case2_4) + test $15, %dl + jnz L(return_null) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(match_case2_4): + mov %ecx, %eax + ret + + .p2align 4 +L(match_higth_case2): + test %dl, %dl + jnz L(return_null) + test $15, %ah + jnz L(match_case2_12) + test $15, %dh + jnz L(return_null) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(match_case2_12): + lea 8(%ecx), %eax + ret + + .p2align 4 +L(match_case1): + test %al, %al + jz L(match_higth_case1) + + test $0x01, %al + jnz L(exit0) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(match_higth_case1): + test $0x01, %ah + jnz L(exit3) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(exit0): + mov %ecx, %eax + ret + + .p2align 4 +L(exit3): + lea 8(%ecx), %eax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + +END (__wcschr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S new file mode 100644 index 0000000000..d3c65a6436 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S @@ -0,0 +1,36 @@ +/* Multiple versions of wcschr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__wcschr) + .type wcschr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcschr_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__wcschr_sse2) +2: ret +END(__wcschr) +weak_alias (__wcschr, wcschr) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c new file mode 100644 index 0000000000..e3337d77e2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c @@ -0,0 +1,14 @@ +#include <wchar.h> + +#define WCSCMP __wcscmp_ia32 +#ifdef SHARED +# undef libc_hidden_def +# define libc_hidden_def(name) \ + __hidden_ver1 (__wcscmp_ia32, __GI___wcscmp, __wcscmp_ia32); +#endif +#undef weak_alias +#define weak_alias(name, alias) + +extern __typeof (wcscmp) __wcscmp_ia32; + +#include "wcsmbs/wcscmp.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S new file mode 100644 index 0000000000..a464b58204 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S @@ -0,0 +1,1018 @@ +/* wcscmp with SSE2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define ENTRANCE PUSH(%esi); PUSH(%edi) +# define RETURN POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi); +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + +/* Note: wcscmp uses signed comparison, not unsugned as in strcmp function. */ + + .text +ENTRY (__wcscmp_sse2) +/* + * This implementation uses SSE to compare up to 16 bytes at a time. +*/ + mov STR1(%esp), %edx + mov STR2(%esp), %eax + + mov (%eax), %ecx + cmp %ecx, (%edx) + jne L(neq) + test %ecx, %ecx + jz L(eq) + + mov 4(%eax), %ecx + cmp %ecx, 4(%edx) + jne L(neq) + test %ecx, %ecx + jz L(eq) + + mov 8(%eax), %ecx + cmp %ecx, 8(%edx) + jne L(neq) + test %ecx, %ecx + jz L(eq) + + mov 12(%eax), %ecx + cmp %ecx, 12(%edx) + jne L(neq) + test %ecx, %ecx + jz L(eq) + + ENTRANCE + add $16, %eax + add $16, %edx + + mov %eax, %esi + mov %edx, %edi + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + mov %al, %ch + mov %dl, %cl + and $63, %eax /* esi alignment in cache line */ + and $63, %edx /* edi alignment in cache line */ + and $15, %cl + jz L(continue_00) + cmp $16, %edx + jb L(continue_0) + cmp $32, %edx + jb L(continue_16) + cmp $48, %edx + jb L(continue_32) + +L(continue_48): + and $15, %ch + jz L(continue_48_00) + cmp $16, %eax + jb L(continue_0_48) + cmp $32, %eax + jb L(continue_16_48) + cmp $48, %eax + jb L(continue_32_48) + + .p2align 4 +L(continue_48_48): + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%edi), %xmm1 + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%edi), %xmm1 + movdqu 48(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_48_48) + +L(continue_0): + and $15, %ch + jz L(continue_0_00) + cmp $16, %eax + jb L(continue_0_0) + cmp $32, %eax + jb L(continue_0_16) + cmp $48, %eax + jb L(continue_0_32) + + .p2align 4 +L(continue_0_48): + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%edi), %xmm1 + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + mov 48(%esi), %ecx + cmp %ecx, 48(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 52(%esi), %ecx + cmp %ecx, 52(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 56(%esi), %ecx + cmp %ecx, 56(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 60(%esi), %ecx + cmp %ecx, 60(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + add $64, %esi + add $64, %edi + jmp L(continue_0_48) + + .p2align 4 +L(continue_00): + and $15, %ch + jz L(continue_00_00) + cmp $16, %eax + jb L(continue_00_0) + cmp $32, %eax + jb L(continue_00_16) + cmp $48, %eax + jb L(continue_00_32) + + .p2align 4 +L(continue_00_48): + pcmpeqd (%edi), %xmm0 + mov (%edi), %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(less4_double_words1) + + cmp (%esi), %eax + jne L(nequal) + + mov 4(%edi), %eax + cmp 4(%esi), %eax + jne L(nequal) + + mov 8(%edi), %eax + cmp 8(%esi), %eax + jne L(nequal) + + mov 12(%edi), %eax + cmp 12(%esi), %eax + jne L(nequal) + + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 48(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_00_48) + + .p2align 4 +L(continue_32): + and $15, %ch + jz L(continue_32_00) + cmp $16, %eax + jb L(continue_0_32) + cmp $32, %eax + jb L(continue_16_32) + cmp $48, %eax + jb L(continue_32_32) + + .p2align 4 +L(continue_32_48): + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 16(%esi), %ecx + cmp %ecx, 16(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 20(%esi), %ecx + cmp %ecx, 20(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 24(%esi), %ecx + cmp %ecx, 24(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 28(%esi), %ecx + cmp %ecx, 28(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 32(%edi), %xmm1 + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%edi), %xmm1 + movdqu 48(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results */ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_32_48) + + .p2align 4 +L(continue_16): + and $15, %ch + jz L(continue_16_00) + cmp $16, %eax + jb L(continue_0_16) + cmp $32, %eax + jb L(continue_16_16) + cmp $48, %eax + jb L(continue_16_32) + + .p2align 4 +L(continue_16_48): + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%edi), %xmm1 + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + mov 32(%esi), %ecx + cmp %ecx, 32(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 36(%esi), %ecx + cmp %ecx, 36(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 40(%esi), %ecx + cmp %ecx, 40(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 44(%esi), %ecx + cmp %ecx, 44(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 48(%edi), %xmm1 + movdqu 48(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_16_48) + + .p2align 4 +L(continue_00_00): + movdqa (%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqa 16(%edi), %xmm3 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%esi), %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqa 32(%edi), %xmm5 + pcmpeqd %xmm5, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%esi), %xmm5 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm5 /* packed sub of comparison results*/ + pmovmskb %xmm5, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqa 48(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_00_00) + + .p2align 4 +L(continue_00_32): + movdqu (%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_00_48) + + .p2align 4 +L(continue_00_16): + movdqu (%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %esi + add $32, %edi + jmp L(continue_00_48) + + .p2align 4 +L(continue_00_0): + movdqu (%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %esi + add $48, %edi + jmp L(continue_00_48) + + .p2align 4 +L(continue_48_00): + pcmpeqd (%esi), %xmm0 + mov (%edi), %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(less4_double_words1) + + cmp (%esi), %eax + jne L(nequal) + + mov 4(%edi), %eax + cmp 4(%esi), %eax + jne L(nequal) + + mov 8(%edi), %eax + cmp 8(%esi), %eax + jne L(nequal) + + mov 12(%edi), %eax + cmp 12(%esi), %eax + jne L(nequal) + + movdqu 16(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_48_00) + + .p2align 4 +L(continue_32_00): + movdqu (%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_48_00) + + .p2align 4 +L(continue_16_00): + movdqu (%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %esi + add $32, %edi + jmp L(continue_48_00) + + .p2align 4 +L(continue_0_00): + movdqu (%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %esi + add $48, %edi + jmp L(continue_48_00) + + .p2align 4 +L(continue_32_32): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_48_48) + + .p2align 4 +L(continue_16_16): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm3 + movdqu 16(%esi), %xmm4 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %esi + add $32, %edi + jmp L(continue_48_48) + + .p2align 4 +L(continue_0_0): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm3 + movdqu 16(%esi), %xmm4 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %esi + add $48, %edi + jmp L(continue_48_48) + + .p2align 4 +L(continue_0_16): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm1 + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %esi + add $32, %edi + jmp L(continue_32_48) + + .p2align 4 +L(continue_0_32): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_16_48) + + .p2align 4 +L(continue_16_32): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_32_48) + + .p2align 4 +L(less4_double_words1): + cmp (%esi), %eax + jne L(nequal) + test %eax, %eax + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + xor %eax, %eax + RETURN + + .p2align 4 +L(less4_double_words): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word): + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word): + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(less4_double_words_16): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_16) + and $15, %dl + jz L(second_double_word_16) + mov 16(%esi), %ecx + cmp %ecx, 16(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word_16): + mov 20(%esi), %ecx + cmp %ecx, 20(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words_16): + and $15, %dh + jz L(fourth_double_word_16) + mov 24(%esi), %ecx + cmp %ecx, 24(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word_16): + mov 28(%esi), %ecx + cmp %ecx, 28(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(less4_double_words_32): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_32) + and $15, %dl + jz L(second_double_word_32) + mov 32(%esi), %ecx + cmp %ecx, 32(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word_32): + mov 36(%esi), %ecx + cmp %ecx, 36(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words_32): + and $15, %dh + jz L(fourth_double_word_32) + mov 40(%esi), %ecx + cmp %ecx, 40(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word_32): + mov 44(%esi), %ecx + cmp %ecx, 44(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(less4_double_words_48): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_48) + and $15, %dl + jz L(second_double_word_48) + mov 48(%esi), %ecx + cmp %ecx, 48(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word_48): + mov 52(%esi), %ecx + cmp %ecx, 52(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words_48): + and $15, %dh + jz L(fourth_double_word_48) + mov 56(%esi), %ecx + cmp %ecx, 56(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word_48): + mov 60(%esi), %ecx + cmp %ecx, 60(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(nequal): + mov $1, %eax + jg L(return) + neg %eax + RETURN + + .p2align 4 +L(return): + RETURN + + .p2align 4 +L(equal): + xorl %eax, %eax + RETURN + + CFI_POP (%edi) + CFI_POP (%esi) + + .p2align 4 +L(neq): + mov $1, %eax + jg L(neq_bigger) + neg %eax + +L(neq_bigger): + ret + + .p2align 4 +L(eq): + xorl %eax, %eax + ret + +END (__wcscmp_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S new file mode 100644 index 0000000000..7118bdd4db --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S @@ -0,0 +1,39 @@ +/* Multiple versions of wcscmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc and for the + DSO. In static binaries, we need wcscmp before the initialization + happened. */ +#if IS_IN (libc) + .text +ENTRY(__wcscmp) + .type __wcscmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcscmp_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__wcscmp_sse2) +2: ret +END(__wcscmp) +weak_alias (__wcscmp, wcscmp) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c new file mode 100644 index 0000000000..fb3000392b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c @@ -0,0 +1,5 @@ +#if IS_IN (libc) +# define wcscpy __wcscpy_ia32 +#endif + +#include "wcsmbs/wcscpy.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S new file mode 100644 index 0000000000..6280ba92ab --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S @@ -0,0 +1,600 @@ +/* wcscpy with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define RETURN POP (%edi); ret; CFI_PUSH (%edi) +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + + atom_text_section +ENTRY (__wcscpy_ssse3) + mov STR1(%esp), %edx + mov STR2(%esp), %ecx + + cmp $0, (%ecx) + jz L(ExitTail4) + cmp $0, 4(%ecx) + jz L(ExitTail8) + cmp $0, 8(%ecx) + jz L(ExitTail12) + cmp $0, 12(%ecx) + jz L(ExitTail16) + + PUSH (%edi) + mov %edx, %edi + PUSH (%esi) + lea 16(%ecx), %esi + + and $-16, %esi + + pxor %xmm0, %xmm0 + pcmpeqd (%esi), %xmm0 + movdqu (%ecx), %xmm1 + movdqu %xmm1, (%edx) + + pmovmskb %xmm0, %eax + sub %ecx, %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + mov %edx, %eax + lea 16(%edx), %edx + and $-16, %edx + sub %edx, %eax + + sub %eax, %ecx + mov %ecx, %eax + and $0xf, %eax + mov $0, %esi + + jz L(Align16Both) + cmp $4, %eax + je L(Shl4) + cmp $8, %eax + je L(Shl8) + jmp L(Shl12) + +L(Align16Both): + movaps (%ecx), %xmm1 + movaps 16(%ecx), %xmm2 + movaps %xmm1, (%edx) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm4 + movaps %xmm3, (%edx, %esi) + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm1 + movaps %xmm4, (%edx, %esi) + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm2 + movaps %xmm1, (%edx, %esi) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%edx, %esi) + mov %ecx, %eax + lea 16(%ecx, %esi), %ecx + and $-0x40, %ecx + sub %ecx, %eax + sub %eax, %edx + + mov $-0x40, %esi + +L(Aligned64Loop): + movaps (%ecx), %xmm2 + movaps 32(%ecx), %xmm3 + movaps %xmm2, %xmm4 + movaps 16(%ecx), %xmm5 + movaps %xmm3, %xmm6 + movaps 48(%ecx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + lea 64(%edx), %edx + pcmpeqd %xmm0, %xmm3 + lea 64(%ecx), %ecx + pmovmskb %xmm3, %eax + + test %eax, %eax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%edx) + movaps %xmm5, -48(%edx) + movaps %xmm6, -32(%edx) + movaps %xmm7, -16(%edx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm5, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm4, -64(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm6, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm5, -48(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%edx) + pcmpeqd %xmm7, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + mov $-0x40, %esi + movaps %xmm7, -16(%edx) + jmp L(Aligned64Loop) + + .p2align 4 +L(Shl4): + movaps -4(%ecx), %xmm1 + movaps 12(%ecx), %xmm2 +L(Shl4Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm1 + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 28(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -12(%ecx), %ecx + sub %eax, %edx + + movaps -4(%ecx), %xmm1 + +L(Shl4LoopStart): + movaps 12(%ecx), %xmm2 + movaps 28(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %eax, %eax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) + + palignr $4, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) + POP (%esi) + add $12, %edx + add $12, %ecx + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(Shl8): + movaps -8(%ecx), %xmm1 + movaps 8(%ecx), %xmm2 +L(Shl8Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm1 + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 24(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -8(%ecx), %ecx + sub %eax, %edx + + movaps -8(%ecx), %xmm1 + +L(Shl8LoopStart): + movaps 8(%ecx), %xmm2 + movaps 24(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %eax, %eax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) + + palignr $8, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + POP (%esi) + add $8, %edx + add $8, %ecx + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(Shl12): + movaps -12(%ecx), %xmm1 + movaps 4(%ecx), %xmm2 +L(Shl12Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm1 + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 20(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -4(%ecx), %ecx + sub %eax, %edx + + movaps -12(%ecx), %xmm1 + +L(Shl12LoopStart): + movaps 4(%ecx), %xmm2 + movaps 20(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %eax, %eax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) + + palignr $12, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + movl (%ecx), %esi + movl %esi, (%edx) + mov $4, %esi + + .p2align 4 +L(CopyFrom1To16Bytes): + add %esi, %edx + add %esi, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) +L(Exit8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit12) +L(Exit16): + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + movl %edi, %eax + RETURN + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) + movl %edi, %eax + RETURN + + .p2align 4 +L(Exit12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) + movl %edi, %eax + RETURN + +CFI_POP (%edi) + + .p2align 4 +L(ExitTail4): + movl (%ecx), %eax + movl %eax, (%edx) + movl %edx, %eax + ret + + .p2align 4 +L(ExitTail8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edx, %eax + ret + + .p2align 4 +L(ExitTail12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) + movl %edx, %eax + ret + + .p2align 4 +L(ExitTail16): + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + movl %edx, %eax + ret + +END (__wcscpy_ssse3) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S new file mode 100644 index 0000000000..cfc97dd87c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S @@ -0,0 +1,36 @@ +/* Multiple versions of wcscpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(wcscpy) + .type wcscpy, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcscpy_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__wcscpy_ssse3) +2: ret +END(wcscpy) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c new file mode 100644 index 0000000000..a335dc0f7e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c @@ -0,0 +1,9 @@ +#include <wchar.h> + +#if IS_IN (libc) +# define WCSLEN __wcslen_ia32 +#endif + +extern __typeof (wcslen) __wcslen_ia32; + +#include "wcsmbs/wcslen.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S new file mode 100644 index 0000000000..bd3fc4c79b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S @@ -0,0 +1,193 @@ +/* wcslen with SSE2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# include <sysdep.h> +# define STR 4 + + .text +ENTRY (__wcslen_sse2) + mov STR(%esp), %edx + + cmp $0, (%edx) + jz L(exit_tail0) + cmp $0, 4(%edx) + jz L(exit_tail1) + cmp $0, 8(%edx) + jz L(exit_tail2) + cmp $0, 12(%edx) + jz L(exit_tail3) + cmp $0, 16(%edx) + jz L(exit_tail4) + cmp $0, 20(%edx) + jz L(exit_tail5) + cmp $0, 24(%edx) + jz L(exit_tail6) + cmp $0, 28(%edx) + jz L(exit_tail7) + + pxor %xmm0, %xmm0 + + lea 32(%edx), %eax + lea 16(%edx), %ecx + and $-16, %eax + + pcmpeqd (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + and $-0x40, %eax + + .p2align 4 +L(aligned_64_loop): + movaps (%eax), %xmm0 + movaps 16(%eax), %xmm1 + movaps 32(%eax), %xmm2 + movaps 48(%eax), %xmm6 + + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 64(%eax), %eax + jz L(aligned_64_loop) + + pcmpeqd -64(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 48(%ecx), %ecx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqd -32(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + jmp L(aligned_64_loop) + + .p2align 4 +L(exit): + sub %ecx, %eax + shr $2, %eax + test %dl, %dl + jz L(exit_high) + + mov %dl, %cl + and $15, %cl + jz L(exit_1) + ret + + .p2align 4 +L(exit_high): + mov %dh, %ch + and $15, %ch + jz L(exit_3) + add $2, %eax + ret + + .p2align 4 +L(exit_1): + add $1, %eax + ret + + .p2align 4 +L(exit_3): + add $3, %eax + ret + + .p2align 4 +L(exit_tail0): + xor %eax, %eax + ret + + .p2align 4 +L(exit_tail1): + mov $1, %eax + ret + + .p2align 4 +L(exit_tail2): + mov $2, %eax + ret + + .p2align 4 +L(exit_tail3): + mov $3, %eax + ret + + .p2align 4 +L(exit_tail4): + mov $4, %eax + ret + + .p2align 4 +L(exit_tail5): + mov $5, %eax + ret + + .p2align 4 +L(exit_tail6): + mov $6, %eax + ret + + .p2align 4 +L(exit_tail7): + mov $7, %eax + ret + +END (__wcslen_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S new file mode 100644 index 0000000000..6ef9b6e7b5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S @@ -0,0 +1,37 @@ +/* Multiple versions of wcslen + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__wcslen) + .type __wcslen, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcslen_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__wcslen_sse2) +2: ret +END(__wcslen) + +weak_alias(__wcslen, wcslen) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c new file mode 100644 index 0000000000..8d8a335b5b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c @@ -0,0 +1,5 @@ +#if IS_IN (libc) +# define wcsrchr __wcsrchr_ia32 +#endif + +#include "wcsmbs/wcsrchr.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S new file mode 100644 index 0000000000..1a9b60e55e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S @@ -0,0 +1,354 @@ +/* wcsrchr with SSE2, without using bsf instructions. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# include <sysdep.h> +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 +# define ENTRANCE PUSH (%edi); +# define RETURN POP (%edi); ret; CFI_PUSH (%edi); +# define STR1 PARMS +# define STR2 STR1+4 + + atom_text_section +ENTRY (__wcsrchr_sse2) + + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + mov %ecx, %edi + punpckldq %xmm1, %xmm1 + pxor %xmm2, %xmm2 + punpckldq %xmm1, %xmm1 + +/* ECX has OFFSET. */ + and $63, %ecx + cmp $48, %ecx + ja L(crosscache) + +/* unaligned string. */ + movdqu (%edi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 +/* Find where NULL is. */ + pmovmskb %xmm2, %ecx +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + add $16, %edi + + test %eax, %eax + jnz L(unaligned_match1) + + test %ecx, %ecx + jnz L(return_null) + + and $-16, %edi + + PUSH (%esi) + + xor %edx, %edx + jmp L(loop) + + CFI_POP (%esi) + + .p2align 4 +L(unaligned_match1): + test %ecx, %ecx + jnz L(prolog_find_zero_1) + + PUSH (%esi) + +/* Save current match */ + mov %eax, %edx + mov %edi, %esi + and $-16, %edi + jmp L(loop) + + CFI_POP (%esi) + + .p2align 4 +L(crosscache): +/* Hancle unaligned string. */ + and $15, %ecx + and $-16, %edi + pxor %xmm3, %xmm3 + movdqa (%edi), %xmm0 + pcmpeqd %xmm0, %xmm3 + pcmpeqd %xmm1, %xmm0 +/* Find where NULL is. */ + pmovmskb %xmm3, %edx +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + shr %cl, %edx + shr %cl, %eax + add $16, %edi + + test %eax, %eax + jnz L(unaligned_match) + + test %edx, %edx + jnz L(return_null) + + PUSH (%esi) + + xor %edx, %edx + jmp L(loop) + + CFI_POP (%esi) + + .p2align 4 +L(unaligned_match): + test %edx, %edx + jnz L(prolog_find_zero) + + PUSH (%esi) + + mov %eax, %edx + lea (%edi, %ecx), %esi + +/* Loop start on aligned string. */ + .p2align 4 +L(loop): + movdqa (%edi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm3 + pcmpeqd %xmm3, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm2, %ecx + pmovmskb %xmm3, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm4 + pcmpeqd %xmm4, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm4 + pmovmskb %xmm2, %ecx + pmovmskb %xmm4, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm5 + pcmpeqd %xmm5, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm5 + pmovmskb %xmm2, %ecx + pmovmskb %xmm5, %eax + or %eax, %ecx + jz L(loop) + + .p2align 4 +L(matches): + test %eax, %eax + jnz L(match) +L(return_value): + test %edx, %edx + jz L(return_null_1) + mov %edx, %eax + mov %esi, %edi + + POP (%esi) + + test %ah, %ah + jnz L(match_third_or_fourth_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(return_null_1): + POP (%esi) + + xor %eax, %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(match): + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(find_zero) +/* save match info */ + mov %eax, %edx + mov %edi, %esi + jmp L(loop) + + .p2align 4 +L(find_zero): + test %cl, %cl + jz L(find_zero_in_third_or_fourth_wchar) + test $15, %cl + jz L(find_zero_in_second_wchar) + and $1, %eax + jz L(return_value) + + POP (%esi) + + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_in_second_wchar): + and $1 << 5 - 1, %eax + jz L(return_value) + + POP (%esi) + + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_in_third_or_fourth_wchar): + test $15, %ch + jz L(find_zero_in_fourth_wchar) + and $1 << 9 - 1, %eax + jz L(return_value) + + POP (%esi) + + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_in_fourth_wchar): + + POP (%esi) + + test %ah, %ah + jnz L(match_third_or_fourth_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(match_second_wchar): + lea -12(%edi), %eax + RETURN + + .p2align 4 +L(match_third_or_fourth_wchar): + test $15 << 4, %ah + jnz L(match_fourth_wchar) + lea -8(%edi), %eax + RETURN + + .p2align 4 +L(match_third_wchar): + lea -8(%edi), %eax + RETURN + + .p2align 4 +L(match_fourth_wchar): + lea -4(%edi), %eax + RETURN + + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero): + add %ecx, %edi + mov %edx, %ecx +L(prolog_find_zero_1): + test %cl, %cl + jz L(prolog_find_zero_in_third_or_fourth_wchar) + test $15, %cl + jz L(prolog_find_zero_in_second_wchar) + and $1, %eax + jz L(return_null) + + lea -16(%edi), %eax + RETURN + + .p2align 4 +L(prolog_find_zero_in_second_wchar): + and $1 << 5 - 1, %eax + jz L(return_null) + + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + .p2align 4 +L(prolog_find_zero_in_third_or_fourth_wchar): + test $15, %ch + jz L(prolog_find_zero_in_fourth_wchar) + and $1 << 9 - 1, %eax + jz L(return_null) + + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + .p2align 4 +L(prolog_find_zero_in_fourth_wchar): + test %ah, %ah + jnz L(match_third_or_fourth_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + +END (__wcsrchr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S new file mode 100644 index 0000000000..cf67333995 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S @@ -0,0 +1,35 @@ +/* Multiple versions of wcsrchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(wcsrchr) + .type wcsrchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcsrchr_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__wcsrchr_sse2) +2: ret +END(wcsrchr) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c new file mode 100644 index 0000000000..75ab4b94c1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c @@ -0,0 +1,9 @@ +#include <wchar.h> + +#if IS_IN (libc) +# define WMEMCMP __wmemcmp_ia32 +#endif + +extern __typeof (wmemcmp) __wmemcmp_ia32; + +#include "wcsmbs/wmemcmp.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S new file mode 100644 index 0000000000..1a857c7e21 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse4_2 + +#include "memcmp-sse4.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S new file mode 100644 index 0000000000..a41ef95fc1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_ssse3 + +#include "memcmp-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S new file mode 100644 index 0000000000..1b9a54a413 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S @@ -0,0 +1,40 @@ +/* Multiple versions of wmemcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) + .text +ENTRY(wmemcmp) + .type wmemcmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wmemcmp_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__wmemcmp_ssse3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + LOAD_FUNC_GOT_EAX (__wmemcmp_sse4_2) +2: ret +END(wmemcmp) +#endif |