diff options
Diffstat (limited to 'REORG.TODO/sysdeps/x86_64/multiarch')
87 files changed, 24585 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/Makefile b/REORG.TODO/sysdeps/x86_64/multiarch/Makefile new file mode 100644 index 0000000000..310a3a4b72 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/Makefile @@ -0,0 +1,42 @@ +ifeq ($(subdir),csu) +tests += test-multiarch +endif + +ifeq ($(subdir),string) + +sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ + strcmp-sse2-unaligned strncmp-ssse3 \ + memcmp-avx2-movbe \ + memcmp-sse4 memcpy-ssse3 \ + memmove-ssse3 \ + memcpy-ssse3-back \ + memmove-ssse3-back \ + memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \ + strncase_l-ssse3 strcat-ssse3 strncat-ssse3\ + strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ + strcpy-sse2-unaligned strncpy-sse2-unaligned \ + stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ + strcat-sse2-unaligned strncat-sse2-unaligned \ + strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ + strcspn-c strpbrk-c strspn-c varshift \ + memset-avx512-no-vzeroupper \ + memmove-avx-unaligned-erms \ + memmove-avx512-unaligned-erms \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms +CFLAGS-varshift.c += -msse4 +CFLAGS-strcspn-c.c += -msse4 +CFLAGS-strpbrk-c.c += -msse4 +CFLAGS-strspn-c.c += -msse4 +endif + +ifeq ($(subdir),wcsmbs) +sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wmemcmp-avx2-movbe \ + wcscpy-ssse3 wcscpy-c \ + wcsnlen-sse4_1 wcsnlen-c +endif + +ifeq ($(subdir),debug) +sysdep_routines += wmemset_chk-nonshared +endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/bcopy.S b/REORG.TODO/sysdeps/x86_64/multiarch/bcopy.S new file mode 100644 index 0000000000..639f02bde3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/bcopy.S @@ -0,0 +1,7 @@ +#include <sysdep.h> + + .text +ENTRY(bcopy) + xchg %rdi, %rsi + jmp __libc_memmove /* Branch to IFUNC memmove. */ +END(bcopy) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-impl-list.c new file mode 100644 index 0000000000..5627183aca --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -0,0 +1,460 @@ +/* Enumerate available IFUNC implementations of a function. x86-64 version. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <assert.h> +#include <string.h> +#include <wchar.h> +#include <ifunc-impl-list.h> +#include <sysdep.h> +#include "init-arch.h" + +/* Maximum number of IFUNC implementations. */ +#define MAX_IFUNC 5 + +/* Fill ARRAY of MAX elements with IFUNC implementations for function + NAME supported on target machine and return the number of valid + entries. */ + +size_t +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + size_t max) +{ + assert (max >= MAX_IFUNC); + + size_t i = 0; + + /* Support sysdeps/x86_64/multiarch/memcmp.S. */ + IFUNC_IMPL (i, name, memcmp, + IFUNC_IMPL_ADD (array, i, memcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE)), + __memcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1), + __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3), + __memcmp_ssse3) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/memmove_chk.c. */ + IFUNC_IMPL (i, name, __memmove_chk, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_chk_avx_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), + __memmove_chk_ssse3_back) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), + __memmove_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, + __memmove_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, + __memmove_chk_sse2_unaligned_erms)) + + /* Support sysdeps/x86_64/multiarch/memmove.S. */ + IFUNC_IMPL (i, name, memmove, + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_avx_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), + __memmove_ssse3_back) + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), + __memmove_ssse3) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) + IFUNC_IMPL_ADD (array, i, memmove, 1, + __memmove_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, 1, + __memmove_sse2_unaligned_erms)) + + /* Support sysdeps/x86_64/multiarch/memset_chk.S. */ + IFUNC_IMPL (i, name, __memset_chk, + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_chk_avx2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_chk_avx512_no_vzeroupper) + ) + + /* Support sysdeps/x86_64/multiarch/memset.S. */ + IFUNC_IMPL (i, name, memset, + IFUNC_IMPL_ADD (array, i, memset, 1, + __memset_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memset, 1, + __memset_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX2_Usable), + __memset_avx2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memset_avx512_no_vzeroupper) + ) + + /* Support sysdeps/x86_64/multiarch/stpncpy.S. */ + IFUNC_IMPL (i, name, stpncpy, + IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3), + __stpncpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, + __stpncpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/stpcpy.S. */ + IFUNC_IMPL (i, name, stpcpy, + IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3), + __stpcpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */ + IFUNC_IMPL (i, name, strcasecmp, + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_ARCH_FEATURE (AVX_Usable), + __strcasecmp_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSE4_2), + __strcasecmp_sse42) + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSSE3), + __strcasecmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.S. */ + IFUNC_IMPL (i, name, strcasecmp_l, + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_ARCH_FEATURE (AVX_Usable), + __strcasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSE4_2), + __strcasecmp_l_sse42) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSSE3), + __strcasecmp_l_ssse3) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, + __strcasecmp_l_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcat.S. */ + IFUNC_IMPL (i, name, strcat, + IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3), + __strcat_ssse3) + IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2)) + + /* Support sysdeps/x86_64/multiarch/strchr.S. */ + IFUNC_IMPL (i, name, strchr, + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf) + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcmp.S. */ + IFUNC_IMPL (i, name, strcmp, + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2), + __strcmp_sse42) + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3), + __strcmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcpy.S. */ + IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3), + __strcpy_ssse3) + IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcspn.S. */ + IFUNC_IMPL (i, name, strcspn, + IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2), + __strcspn_sse42) + IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_sse2)) + + /* Support sysdeps/x86_64/multiarch/strncase_l.S. */ + IFUNC_IMPL (i, name, strncasecmp, + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_ARCH_FEATURE (AVX_Usable), + __strncasecmp_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSE4_2), + __strncasecmp_sse42) + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSSE3), + __strncasecmp_ssse3) + IFUNC_IMPL_ADD (array, i, strncasecmp, 1, + __strncasecmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/strncase_l.S. */ + IFUNC_IMPL (i, name, strncasecmp_l, + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_ARCH_FEATURE (AVX_Usable), + __strncasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSE4_2), + __strncasecmp_l_sse42) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSSE3), + __strncasecmp_l_ssse3) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, + __strncasecmp_l_sse2)) + + /* Support sysdeps/x86_64/multiarch/strncat.S. */ + IFUNC_IMPL (i, name, strncat, + IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3), + __strncat_ssse3) + IFUNC_IMPL_ADD (array, i, strncat, 1, + __strncat_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2)) + + /* Support sysdeps/x86_64/multiarch/strncpy.S. */ + IFUNC_IMPL (i, name, strncpy, + IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3), + __strncpy_ssse3) + IFUNC_IMPL_ADD (array, i, strncpy, 1, + __strncpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/strpbrk.S. */ + IFUNC_IMPL (i, name, strpbrk, + IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2), + __strpbrk_sse42) + IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2)) + + + /* Support sysdeps/x86_64/multiarch/strspn.S. */ + IFUNC_IMPL (i, name, strspn, + IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2), + __strspn_sse42) + IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_sse2)) + + /* Support sysdeps/x86_64/multiarch/strstr.c. */ + IFUNC_IMPL (i, name, strstr, + IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcscpy.S. */ + IFUNC_IMPL (i, name, wcscpy, + IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3), + __wcscpy_ssse3) + IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ + IFUNC_IMPL (i, name, wcsnlen, + IFUNC_IMPL_ADD (array, i, wcsnlen, + HAS_CPU_FEATURE (SSE4_1), + __wcsnlen_sse4_1) + IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemcmp.S. */ + IFUNC_IMPL (i, name, wmemcmp, + IFUNC_IMPL_ADD (array, i, wmemcmp, + (HAS_ARCH_FEATURE (AVX2_Usable) + && HAS_CPU_FEATURE (MOVBE)), + __wmemcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1), + __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3), + __wmemcmp_ssse3) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemset.c. */ + IFUNC_IMPL (i, name, wmemset, + IFUNC_IMPL_ADD (array, i, wmemset, 1, + __wmemset_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemset_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + HAS_ARCH_FEATURE (AVX512F_Usable), + __wmemset_avx512_unaligned)) + +#ifdef SHARED + /* Support sysdeps/x86_64/multiarch/memcpy_chk.S. */ + IFUNC_IMPL (i, name, __memcpy_chk, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_chk_avx_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __memcpy_chk_ssse3_back) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __memcpy_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, + __memcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, + __memcpy_chk_sse2_unaligned_erms)) + + /* Support sysdeps/x86_64/multiarch/memcpy.S. */ + IFUNC_IMPL (i, name, memcpy, + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_avx_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), + __memcpy_ssse3_back) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), + __memcpy_ssse3) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, 1, + __memcpy_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)) + + /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */ + IFUNC_IMPL (i, name, __mempcpy_chk, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_chk_avx_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __mempcpy_chk_ssse3_back) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __mempcpy_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, + __mempcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, + __mempcpy_chk_sse2_unaligned_erms)) + + /* Support sysdeps/x86_64/multiarch/mempcpy.S. */ + IFUNC_IMPL (i, name, mempcpy, + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_avx_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), + __mempcpy_ssse3_back) + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), + __mempcpy_ssse3) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, + __mempcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, + __mempcpy_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)) + + /* Support sysdeps/x86_64/multiarch/strncmp.S. */ + IFUNC_IMPL (i, name, strncmp, + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2), + __strncmp_sse42) + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3), + __strncmp_ssse3) + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemset_chk.c. */ + IFUNC_IMPL (i, name, __wmemset_chk, + IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1, + __wmemset_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX2_Usable), + __wmemset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __wmemset_chk_avx512_unaligned)) +#endif + + return i; +} diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-wmemset.h new file mode 100644 index 0000000000..d761985a47 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/ifunc-wmemset.h @@ -0,0 +1,42 @@ +/* Common definition for wmemset/wmemset_chk ifunc selections. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) + && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + return OPTIMIZE (avx512_unaligned); + else + return OPTIMIZE (avx2_unaligned); + } + + return OPTIMIZE (sse2_unaligned); +} diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S new file mode 100644 index 0000000000..47630dd97b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -0,0 +1,425 @@ +/* memcmp/wmemcmp optimized with AVX2. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +/* memcmp/wmemcmp is implemented as: + 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap + to avoid branches. + 2. Use overlapping compare to avoid branch. + 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 + bytes for wmemcmp. + 4. If size is 8 * VEC_SIZE or less, unroll the loop. + 5. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. + 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. + 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. + 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_avx2_movbe +# endif + +# ifdef USE_AS_WMEMCMP +# define VPCMPEQ vpcmpeqd +# else +# define VPCMPEQ vpcmpeqb +# endif + +# ifndef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +# define VEC_SIZE 32 +# define VEC_MASK ((1 << VEC_SIZE) - 1) + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + .section .text.avx,"ax",@progbits +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx +# endif + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + +L(last_2x_vec): + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + +L(last_vec): + /* Use overlapping loads to avoid branches. */ + leaq -VEC_SIZE(%rdi, %rdx), %rdi + leaq -VEC_SIZE(%rsi, %rdx), %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + VZEROUPPER + ret + + .p2align 4 +L(first_vec): + /* A byte or int32 is different within 16 or 32 bytes. */ + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (%rdi, %rcx), %edx + cmpl (%rsi, %rcx), %edx +L(wmemcmp_return): + setl %al + negl %eax + orl $1, %eax +# else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx + sub %edx, %eax +# endif + VZEROUPPER + ret + +# ifdef USE_AS_WMEMCMP + .p2align 4 +L(4): + xorl %eax, %eax + movl (%rdi), %edx + cmpl (%rsi), %edx + jne L(wmemcmp_return) + ret +# else + .p2align 4 +L(between_4_7): + /* Load as big endian with overlapping movbe to avoid branches. */ + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + je L(exit) + sbbl %eax, %eax + orl $1, %eax + ret + + .p2align 4 +L(exit): + ret + + .p2align 4 +L(between_2_3): + /* Load as big endian with overlapping loads and bswap to avoid + branches. */ + movzwl -2(%rdi, %rdx), %eax + movzwl -2(%rsi, %rdx), %ecx + shll $16, %eax + shll $16, %ecx + movzwl (%rdi), %edi + movzwl (%rsi), %esi + orl %edi, %eax + orl %esi, %ecx + bswap %eax + bswap %ecx + subl %ecx, %eax + ret + + .p2align 4 +L(1): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + subl %ecx, %eax + ret +# endif + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + .p2align 4 +L(less_vec): +# ifdef USE_AS_WMEMCMP + /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ + cmpb $4, %dl + je L(4) + jb L(zero) +# else + cmpb $1, %dl + je L(1) + jb L(zero) + cmpb $4, %dl + jb L(between_2_3) + cmpb $8, %dl + jb L(between_4_7) +# endif + cmpb $16, %dl + jae L(between_16_31) + /* It is between 8 and 15 bytes. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + /* Use overlapping loads to avoid branches. */ + leaq -8(%rdi, %rdx), %rdi + leaq -8(%rsi, %rdx), %rsi + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + vmovdqu (%rsi), %xmm2 + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. */ + leaq -16(%rdi, %rdx), %rdi + leaq -16(%rsi, %rdx), %rsi + vmovdqu (%rsi), %xmm2 + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax + jnz L(first_vec) + ret + + .p2align 4 +L(more_2x_vec): + /* More than 2 * VEC. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + + /* From 4 * VEC to 8 * VEC, inclusively. */ + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + + vpand %ymm1, %ymm2, %ymm5 + vpand %ymm3, %ymm4, %ymm6 + vpand %ymm5, %ymm6, %ymm5 + + vpmovmskb %ymm5, %eax + subl $VEC_MASK, %eax + jnz L(4x_vec_end) + + leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + vpand %ymm2, %ymm1, %ymm5 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vpand %ymm3, %ymm5, %ymm5 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + vpand %ymm4, %ymm5, %ymm5 + + vpmovmskb %ymm5, %eax + subl $VEC_MASK, %eax + jnz L(4x_vec_end) + VZEROUPPER + ret + + .p2align 4 +L(more_8x_vec): + /* More than 8 * VEC. Check the first VEC. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Align the first memory area for aligned loads in the loop. + Compute how much the first memory area is misaligned. */ + movq %rdi, %rcx + andl $(VEC_SIZE - 1), %ecx + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %rcx + /* Adjust the second memory area. */ + subq %rcx, %rsi + /* Adjust the first memory area which should be aligned now. */ + subq %rcx, %rdi + /* Adjust length. */ + addq %rcx, %rdx + +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + vmovdqu (%rsi), %ymm1 + VPCMPEQ (%rdi), %ymm1, %ymm1 + + vmovdqu VEC_SIZE(%rsi), %ymm2 + VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 + vpand %ymm2, %ymm1, %ymm5 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 + VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vpand %ymm3, %ymm5, %ymm5 + + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 + VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + vpand %ymm4, %ymm5, %ymm5 + + vpmovmskb %ymm5, %eax + subl $VEC_MASK, %eax + jnz L(4x_vec_end) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rsi + + subq $(VEC_SIZE * 4), %rdx + cmpq $(VEC_SIZE * 4), %rdx + jae L(loop_4x_vec) + + /* Less than 4 * VEC. */ + cmpq $VEC_SIZE, %rdx + jbe L(last_vec) + cmpq $(VEC_SIZE * 2), %rdx + jbe L(last_2x_vec) + +L(last_4x_vec): + /* From 2 * VEC to 4 * VEC. */ + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + /* Use overlapping loads to avoid branches. */ + leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rsi + vmovdqu (%rsi), %ymm2 + VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + VZEROUPPER + ret + + .p2align 4 +L(4x_vec_end): + vpmovmskb %ymm1, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x1) + vpmovmskb %ymm3, %eax + subl $VEC_MASK, %eax + jnz L(first_vec_x2) + vpmovmskb %ymm4, %eax + subl $VEC_MASK, %eax + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 3)(%rdi, %rcx), %edx + cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx + sub %edx, %eax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x1): + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl VEC_SIZE(%rdi, %rcx), %edx + cmpl VEC_SIZE(%rsi, %rcx), %edx + jmp L(wmemcmp_return) +# else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %edx + sub %edx, %eax +# endif + VZEROUPPER + ret + + .p2align 4 +L(first_vec_x2): + tzcntl %eax, %ecx +# ifdef USE_AS_WMEMCMP + xorl %eax, %eax + movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx + jmp L(wmemcmp_return) +# else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx + sub %edx, %eax +# endif + VZEROUPPER + ret +END (MEMCMP) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-sse4.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-sse4.S new file mode 100644 index 0000000000..771639f662 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -0,0 +1,1776 @@ +/* memcmp with SSE4.1, wmemcmp with SSE4.1 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_sse4_1 +# endif + +# define JMPTBL(I, B) (I - B) + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + add %r11, %rcx; \ + jmp *%rcx; \ + ud2 + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + .section .text.sse4.1,"ax",@progbits +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx +# endif + pxor %xmm0, %xmm0 + cmp $79, %rdx + ja L(79bytesormore) +# ifndef USE_AS_WMEMCMP + cmp $1, %rdx + je L(firstbyte) +# endif + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(firstbyte): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + sub %ecx, %eax + ret +# endif + + .p2align 4 +L(79bytesormore): + movdqu (%rsi), %xmm1 + movdqu (%rdi), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + mov %rsi, %rcx + and $-16, %rsi + add $16, %rsi + sub %rsi, %rcx + + sub %rcx, %rdi + add %rcx, %rdx + test $0xf, %rdi + jz L(2aligned) + + cmp $128, %rdx + ja L(128bytesormore) +L(less128bytes): + sub $64, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(128bytesormore): + cmp $512, %rdx + ja L(512bytesormore) + cmp $256, %rdx + ja L(less512bytes) +L(less256bytes): + sub $128, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin128) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(less512bytes): + sub $256, %rdx + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqu 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqu 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqu 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqu 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqu 192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqu 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqu 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqu 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytes) + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin256) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + .p2align 4 +L(512bytesormore): +# ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP +# else + mov __x86_data_cache_size_half(%rip), %R8_LP +# endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_unaglined) + sub $64, %rdx + .p2align 4 +L(64bytesormore_loop): + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(L2_L3_cache_unaglined): + sub $64, %rdx + .p2align 4 +L(L2_L3_unaligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_unaligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +/* + * This case is for machines which are sensitive for unaligned instructions. + */ + .p2align 4 +L(2aligned): + cmp $128, %rdx + ja L(128bytesormorein2aligned) +L(less128bytesin2aligned): + sub $64, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64in2alinged) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64in2alinged): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + .p2align 4 +L(128bytesormorein2aligned): + cmp $512, %rdx + ja L(512bytesormorein2aligned) + cmp $256, %rdx + ja L(256bytesormorein2aligned) +L(less256bytesin2alinged): + sub $128, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin128in2aligned) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128in2aligned): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + .p2align 4 +L(256bytesormorein2aligned): + + sub $256, %rdx + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqa 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqa 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqa 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqa 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqa 192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqa 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqa 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqa 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytesin2alinged) + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin256in2alinged) + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256in2alinged): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + .p2align 4 +L(512bytesormorein2aligned): +# ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP +# else + mov __x86_data_cache_size_half(%rip), %R8_LP +# endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_aglined) + + sub $64, %rdx + .p2align 4 +L(64bytesormore_loopin2aligned): + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loopin2aligned) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +L(L2_L3_cache_aglined): + sub $64, %rdx + + .p2align 4 +L(L2_L3_aligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_aligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + + .p2align 4 +L(64bytesormore_loop_end): + add $16, %rdi + add $16, %rsi + ptest %xmm2, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm3, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm4, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + jmp L(16bytes) + +L(256bytesin256): + add $256, %rdi + add $256, %rsi + jmp L(16bytes) +L(240bytesin256): + add $240, %rdi + add $240, %rsi + jmp L(16bytes) +L(224bytesin256): + add $224, %rdi + add $224, %rsi + jmp L(16bytes) +L(208bytesin256): + add $208, %rdi + add $208, %rsi + jmp L(16bytes) +L(192bytesin256): + add $192, %rdi + add $192, %rsi + jmp L(16bytes) +L(176bytesin256): + add $176, %rdi + add $176, %rsi + jmp L(16bytes) +L(160bytesin256): + add $160, %rdi + add $160, %rsi + jmp L(16bytes) +L(144bytesin256): + add $144, %rdi + add $144, %rsi + jmp L(16bytes) +L(128bytesin256): + add $128, %rdi + add $128, %rsi + jmp L(16bytes) +L(112bytesin256): + add $112, %rdi + add $112, %rsi + jmp L(16bytes) +L(96bytesin256): + add $96, %rdi + add $96, %rsi + jmp L(16bytes) +L(80bytesin256): + add $80, %rdi + add $80, %rsi + jmp L(16bytes) +L(64bytesin256): + add $64, %rdi + add $64, %rsi + jmp L(16bytes) +L(48bytesin256): + add $16, %rdi + add $16, %rsi +L(32bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytes): + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(8bytes): + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(12bytes): + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(4bytes): + mov -4(%rsi), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax + cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif + jne L(diffin4bytes) +L(0bytes): + xor %eax, %eax + ret + +# ifndef USE_AS_WMEMCMP +/* unreal case for wmemcmp */ + .p2align 4 +L(65bytes): + movdqu -65(%rdi), %xmm1 + movdqu -65(%rsi), %xmm2 + mov $-65, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(49bytes): + movdqu -49(%rdi), %xmm1 + movdqu -49(%rsi), %xmm2 + mov $-49, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(33bytes): + movdqu -33(%rdi), %xmm1 + movdqu -33(%rsi), %xmm2 + mov $-33, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(17bytes): + mov -17(%rdi), %rax + mov -17(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(9bytes): + mov -9(%rdi), %rax + mov -9(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(13bytes): + mov -13(%rdi), %rax + mov -13(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(5bytes): + mov -5(%rdi), %eax + mov -5(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(66bytes): + movdqu -66(%rdi), %xmm1 + movdqu -66(%rsi), %xmm2 + mov $-66, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(50bytes): + movdqu -50(%rdi), %xmm1 + movdqu -50(%rsi), %xmm2 + mov $-50, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + movdqu -34(%rdi), %xmm1 + movdqu -34(%rsi), %xmm2 + mov $-34, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%rdi), %rax + mov -18(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(10bytes): + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + .p2align 4 +L(14bytes): + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(6bytes): + mov -6(%rdi), %eax + mov -6(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) +L(2bytes): + movzwl -2(%rsi), %ecx + movzwl -2(%rdi), %eax + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + .p2align 4 +L(67bytes): + movdqu -67(%rdi), %xmm2 + movdqu -67(%rsi), %xmm1 + mov $-67, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(51bytes): + movdqu -51(%rdi), %xmm2 + movdqu -51(%rsi), %xmm1 + mov $-51, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(35bytes): + movdqu -35(%rsi), %xmm1 + movdqu -35(%rdi), %xmm2 + mov $-35, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(19bytes): + mov -19(%rdi), %rax + mov -19(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(11bytes): + mov -11(%rdi), %rax + mov -11(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + .p2align 4 +L(15bytes): + mov -15(%rdi), %rax + mov -15(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(7bytes): + mov -7(%rdi), %eax + mov -7(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + .p2align 4 +L(3bytes): + movzwl -3(%rdi), %eax + movzwl -3(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin2bytes) +L(1bytes): + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %ecx + sub %ecx, %eax + ret +# endif + + .p2align 4 +L(68bytes): + movdqu -68(%rdi), %xmm2 + movdqu -68(%rsi), %xmm1 + mov $-68, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(52bytes): + movdqu -52(%rdi), %xmm2 + movdqu -52(%rsi), %xmm1 + mov $-52, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%rdi), %xmm2 + movdqu -36(%rsi), %xmm1 + mov $-36, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%rdi), %xmm2 + movdqu -20(%rsi), %xmm1 + mov $-20, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%rsi), %ecx + +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax + cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif + jne L(diffin4bytes) + xor %eax, %eax + ret + +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ + .p2align 4 +L(69bytes): + movdqu -69(%rsi), %xmm1 + movdqu -69(%rdi), %xmm2 + mov $-69, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(53bytes): + movdqu -53(%rsi), %xmm1 + movdqu -53(%rdi), %xmm2 + mov $-53, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(37bytes): + movdqu -37(%rsi), %xmm1 + movdqu -37(%rdi), %xmm2 + mov $-37, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(21bytes): + movdqu -21(%rsi), %xmm1 + movdqu -21(%rdi), %xmm2 + mov $-21, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(70bytes): + movdqu -70(%rsi), %xmm1 + movdqu -70(%rdi), %xmm2 + mov $-70, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(54bytes): + movdqu -54(%rsi), %xmm1 + movdqu -54(%rdi), %xmm2 + mov $-54, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + movdqu -38(%rsi), %xmm1 + movdqu -38(%rdi), %xmm2 + mov $-38, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + movdqu -22(%rsi), %xmm1 + movdqu -22(%rdi), %xmm2 + mov $-22, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(71bytes): + movdqu -71(%rsi), %xmm1 + movdqu -71(%rdi), %xmm2 + mov $-71, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(55bytes): + movdqu -55(%rdi), %xmm2 + movdqu -55(%rsi), %xmm1 + mov $-55, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(39bytes): + movdqu -39(%rdi), %xmm2 + movdqu -39(%rsi), %xmm1 + mov $-39, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(23bytes): + movdqu -23(%rdi), %xmm2 + movdqu -23(%rsi), %xmm1 + mov $-23, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret +# endif + + .p2align 4 +L(72bytes): + movdqu -72(%rsi), %xmm1 + movdqu -72(%rdi), %xmm2 + mov $-72, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(56bytes): + movdqu -56(%rdi), %xmm2 + movdqu -56(%rsi), %xmm1 + mov $-56, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + movdqu -40(%rdi), %xmm2 + movdqu -40(%rsi), %xmm1 + mov $-40, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + movdqu -24(%rdi), %xmm2 + movdqu -24(%rsi), %xmm1 + mov $-24, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -8(%rsi), %rcx + mov -8(%rdi), %rax + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ + .p2align 4 +L(73bytes): + movdqu -73(%rsi), %xmm1 + movdqu -73(%rdi), %xmm2 + mov $-73, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(57bytes): + movdqu -57(%rdi), %xmm2 + movdqu -57(%rsi), %xmm1 + mov $-57, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(41bytes): + movdqu -41(%rdi), %xmm2 + movdqu -41(%rsi), %xmm1 + mov $-41, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(25bytes): + movdqu -25(%rdi), %xmm2 + movdqu -25(%rsi), %xmm1 + mov $-25, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -9(%rdi), %rax + mov -9(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %ecx + sub %ecx, %eax + ret + + .p2align 4 +L(74bytes): + movdqu -74(%rsi), %xmm1 + movdqu -74(%rdi), %xmm2 + mov $-74, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(58bytes): + movdqu -58(%rdi), %xmm2 + movdqu -58(%rsi), %xmm1 + mov $-58, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + movdqu -42(%rdi), %xmm2 + movdqu -42(%rsi), %xmm1 + mov $-42, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + movdqu -26(%rdi), %xmm2 + movdqu -26(%rsi), %xmm1 + mov $-26, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + jmp L(diffin2bytes) + + .p2align 4 +L(75bytes): + movdqu -75(%rsi), %xmm1 + movdqu -75(%rdi), %xmm2 + mov $-75, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(59bytes): + movdqu -59(%rdi), %xmm2 + movdqu -59(%rsi), %xmm1 + mov $-59, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(43bytes): + movdqu -43(%rdi), %xmm2 + movdqu -43(%rsi), %xmm1 + mov $-43, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(27bytes): + movdqu -27(%rdi), %xmm2 + movdqu -27(%rsi), %xmm1 + mov $-27, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -11(%rdi), %rax + mov -11(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret +# endif + .p2align 4 +L(76bytes): + movdqu -76(%rsi), %xmm1 + movdqu -76(%rdi), %xmm2 + mov $-76, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(60bytes): + movdqu -60(%rdi), %xmm2 + movdqu -60(%rsi), %xmm1 + mov $-60, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + movdqu -44(%rdi), %xmm2 + movdqu -44(%rsi), %xmm1 + mov $-44, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + movdqu -28(%rdi), %xmm2 + movdqu -28(%rsi), %xmm1 + mov $-28, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rsi), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%rdi), %eax + cmp %eax, %ecx +# else + cmp -4(%rdi), %ecx +# endif + jne L(diffin4bytes) + xor %eax, %eax + ret + +# ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ + .p2align 4 +L(77bytes): + movdqu -77(%rsi), %xmm1 + movdqu -77(%rdi), %xmm2 + mov $-77, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(61bytes): + movdqu -61(%rdi), %xmm2 + movdqu -61(%rsi), %xmm1 + mov $-61, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(45bytes): + movdqu -45(%rdi), %xmm2 + movdqu -45(%rsi), %xmm1 + mov $-45, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(29bytes): + movdqu -29(%rdi), %xmm2 + movdqu -29(%rsi), %xmm1 + mov $-29, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -13(%rdi), %rax + mov -13(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(78bytes): + movdqu -78(%rsi), %xmm1 + movdqu -78(%rdi), %xmm2 + mov $-78, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(62bytes): + movdqu -62(%rdi), %xmm2 + movdqu -62(%rsi), %xmm1 + mov $-62, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + movdqu -46(%rdi), %xmm2 + movdqu -46(%rsi), %xmm1 + mov $-46, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + movdqu -30(%rdi), %xmm2 + movdqu -30(%rsi), %xmm1 + mov $-30, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + .p2align 4 +L(79bytes): + movdqu -79(%rsi), %xmm1 + movdqu -79(%rdi), %xmm2 + mov $-79, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(63bytes): + movdqu -63(%rdi), %xmm2 + movdqu -63(%rsi), %xmm1 + mov $-63, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(47bytes): + movdqu -47(%rdi), %xmm2 + movdqu -47(%rsi), %xmm1 + mov $-47, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(31bytes): + movdqu -31(%rdi), %xmm2 + movdqu -31(%rsi), %xmm1 + mov $-31, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -15(%rdi), %rax + mov -15(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret +# endif + .p2align 4 +L(64bytes): + movdqu -64(%rdi), %xmm2 + movdqu -64(%rsi), %xmm1 + mov $-64, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%rdi), %xmm2 + movdqu -48(%rsi), %xmm1 + mov $-48, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%rdi), %xmm2 + movdqu -32(%rsi), %xmm1 + mov $-32, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + +/* + * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. + */ + .p2align 3 +L(less16bytes): + movsbq %dl, %rdx + mov (%rsi, %rdx), %rcx + mov (%rdi, %rdx), %rax + cmp %rax, %rcx + jne L(diffin8bytes) + mov 8(%rsi, %rdx), %rcx + mov 8(%rdi, %rdx), %rax +L(diffin8bytes): + cmp %eax, %ecx + jne L(diffin4bytes) + shr $32, %rcx + shr $32, %rax + +# ifdef USE_AS_WMEMCMP +/* for wmemcmp */ + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret +# endif + +L(diffin4bytes): +# ifndef USE_AS_WMEMCMP + cmp %cx, %ax + jne L(diffin2bytes) + shr $16, %ecx + shr $16, %eax +L(diffin2bytes): + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + .p2align 4 +L(end): + and $0xff, %eax + and $0xff, %ecx + sub %ecx, %eax + ret +# else + +/* for wmemcmp */ + mov $1, %eax + jl L(nequal_bigger) + neg %eax + ret + + .p2align 4 +L(nequal_bigger): + ret + +L(unreal_case): + xor %eax, %eax + ret +# endif + +END (MEMCMP) + + .section .rodata.sse4.1,"a",@progbits + .p2align 3 +# ifndef USE_AS_WMEMCMP +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(1bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(3bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(5bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(7bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(9bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(11bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(13bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(15bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(17bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(19bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(21bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(23bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(25bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(27bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(29bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(31bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(33bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(35bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(37bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(39bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(41bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(43bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(45bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(47bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(49bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(51bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(53bytes), L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(55bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(57bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(59bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(61bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(63bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) + .int JMPTBL (L(65bytes), L(table_64bytes)) + .int JMPTBL (L(66bytes), L(table_64bytes)) + .int JMPTBL (L(67bytes), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(69bytes), L(table_64bytes)) + .int JMPTBL (L(70bytes), L(table_64bytes)) + .int JMPTBL (L(71bytes), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(73bytes), L(table_64bytes)) + .int JMPTBL (L(74bytes), L(table_64bytes)) + .int JMPTBL (L(75bytes), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(77bytes), L(table_64bytes)) + .int JMPTBL (L(78bytes), L(table_64bytes)) + .int JMPTBL (L(79bytes), L(table_64bytes)) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-ssse3.S new file mode 100644 index 0000000000..8d7d2fe67b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp-ssse3.S @@ -0,0 +1,1990 @@ +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + atom_text_section +ENTRY (MEMCMP) +# ifdef USE_AS_WMEMCMP + shl $2, %rdx + test %rdx, %rdx + jz L(equal) +# endif + mov %rdx, %rcx + mov %rdi, %rdx + cmp $48, %rcx; + jae L(48bytesormore) /* LEN => 48 */ + + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +/* ECX >= 32. */ +L(48bytesormore): + movdqu (%rdi), %xmm3 + movdqu (%rsi), %xmm0 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%rdi), %rdi + lea 16(%rsi), %rsi + sub $0xffff, %edx + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %rdx, %rdi + sub %rdx, %rsi + add %rdx, %rcx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %rdx, %rsi + +# ifndef USE_AS_WMEMCMP + cmp $8, %edx + jae L(next_unaligned_table) + cmp $0, %edx + je L(shr_0) + cmp $1, %edx + je L(shr_1) + cmp $2, %edx + je L(shr_2) + cmp $3, %edx + je L(shr_3) + cmp $4, %edx + je L(shr_4) + cmp $5, %edx + je L(shr_5) + cmp $6, %edx + je L(shr_6) + jmp L(shr_7) + + .p2align 2 +L(next_unaligned_table): + cmp $8, %edx + je L(shr_8) + cmp $9, %edx + je L(shr_9) + cmp $10, %edx + je L(shr_10) + cmp $11, %edx + je L(shr_11) + cmp $12, %edx + je L(shr_12) + cmp $13, %edx + je L(shr_13) + cmp $14, %edx + je L(shr_14) + jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif + + .p2align 4 +L(shr_0): + cmp $80, %rcx + lea -48(%rcx), %rcx + jae L(shr_0_gobble) + xor %eax, %eax + movdqa (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + movdqa 16(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm2 + pand %xmm1, %xmm2 + pmovmskb %xmm2, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_0_gobble): + movdqa (%rsi), %xmm0 + xor %eax, %eax + pcmpeqb (%rdi), %xmm0 + sub $32, %rcx + movdqa 16(%rsi), %xmm2 + pcmpeqb 16(%rdi), %xmm2 +L(shr_0_gobble_loop): + pand %xmm0, %xmm2 + sub $32, %rcx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%rsi), %xmm0 + movdqa 48(%rsi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%rdi), %xmm0 + pcmpeqb 48(%rdi), %xmm2 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + jz L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %rcx + jge L(next) + inc %edx + add $32, %rcx +L(next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(shr_1): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_1_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $1, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $1, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $1, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_1_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $1, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $1, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_1_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $1, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $1, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_1_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_1_gobble_next) + inc %edx + add $32, %rcx +L(shr_1_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 1(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + + .p2align 4 +L(shr_2): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $2, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $2, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_2_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $2, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $2, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $2, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $2, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_2_gobble_next) + inc %edx + add $32, %rcx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 2(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_3): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_3_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $3, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $3, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $3, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_3_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $3, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $3, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_3_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $3, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $3, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_3_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_3_gobble_next) + inc %edx + add $32, %rcx +L(shr_3_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 3(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + .p2align 4 +L(shr_4): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $4, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $4, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_4_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $4, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $4, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $4, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $4, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_4_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_4_gobble_next) + inc %edx + add $32, %rcx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 4(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(shr_5): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_5_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $5, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $5, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $5, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_5_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $5, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $5, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_5_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $5, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $5, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_5_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_5_gobble_next) + inc %edx + add $32, %rcx +L(shr_5_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 5(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_6): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $6, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $6, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_6_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $6, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $6, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $6, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $6, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_6_gobble_next) + inc %edx + add $32, %rcx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 6(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_7): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_7_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $7, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $7, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $7, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_7_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $7, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $7, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_7_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $7, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $7, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_7_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_7_gobble_next) + inc %edx + add $32, %rcx +L(shr_7_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 7(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + .p2align 4 +L(shr_8): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $8, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $8, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_8_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $8, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $8, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $8, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $8, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_8_gobble_next) + inc %edx + add $32, %rcx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 8(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(shr_9): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_9_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $9, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $9, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $9, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_9_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $9, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $9, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_9_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $9, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $9, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_9_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_9_gobble_next) + inc %edx + add $32, %rcx +L(shr_9_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 9(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_10): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $10, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $10, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_10_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $10, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $10, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $10, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $10, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_10_gobble_next) + inc %edx + add $32, %rcx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 10(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_11): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_11_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $11, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $11, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $11, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_11_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $11, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $11, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_11_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $11, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $11, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_11_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_11_gobble_next) + inc %edx + add $32, %rcx +L(shr_11_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 11(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# endif + + .p2align 4 +L(shr_12): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $12, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_12_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $12, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $12, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $12, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $12, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_12_gobble_next) + inc %edx + add $32, %rcx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 12(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(shr_13): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_13_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $13, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $13, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $13, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_13_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $13, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $13, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_13_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $13, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $13, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_13_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_13_gobble_next) + inc %edx + add $32, %rcx +L(shr_13_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 13(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_14): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $14, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_14_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $14, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $14, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $14, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $14, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_14_gobble_next) + inc %edx + add $32, %rcx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 14(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_15): + cmp $80, %rcx + lea -48(%rcx), %rcx + mov %edx, %eax + jae L(shr_15_gobble) + + movdqa 16(%rsi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $15, (%rsi), %xmm1 + pcmpeqb (%rdi), %xmm1 + + movdqa 32(%rsi), %xmm3 + palignr $15, %xmm2, %xmm3 + pcmpeqb 16(%rdi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + add $15, %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) + + .p2align 4 +L(shr_15_gobble): + sub $32, %rcx + movdqa 16(%rsi), %xmm0 + palignr $15, (%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + + movdqa 32(%rsi), %xmm3 + palignr $15, 16(%rsi), %xmm3 + pcmpeqb 16(%rdi), %xmm3 + +L(shr_15_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %rcx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%rsi), %xmm3 + palignr $15, 48(%rsi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%rsi), %xmm0 + palignr $15, 32(%rsi), %xmm0 + pcmpeqb 32(%rdi), %xmm0 + lea 32(%rsi), %rsi + pcmpeqb 48(%rdi), %xmm3 + + lea 32(%rdi), %rdi + jz L(shr_15_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %rcx + jge L(shr_15_gobble_next) + inc %edx + add $32, %rcx +L(shr_15_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%rdi), %rdi + lea 32(%rsi), %rsi + sub $0xffff, %edx + jnz L(exit) + + lea 15(%rsi), %rsi + add %rcx, %rsi + add %rcx, %rdi + jmp L(less48bytes) +# endif + .p2align 4 +L(exit): + pmovmskb %xmm1, %r8d + sub $0xffff, %r8d + jz L(first16bytes) + lea -16(%rsi), %rsi + lea -16(%rdi), %rdi + mov %r8d, %edx +L(first16bytes): + add %rax, %rsi +L(less16bytes): +# ifndef USE_AS_WMEMCMP + test %dl, %dl + jz L(next_24_bytes) + + test $0x01, %dl + jnz L(Byte16) + + test $0x02, %dl + jnz L(Byte17) + + test $0x04, %dl + jnz L(Byte18) + + test $0x08, %dl + jnz L(Byte19) + + test $0x10, %dl + jnz L(Byte20) + + test $0x20, %dl + jnz L(Byte21) + + test $0x40, %dl + jnz L(Byte22) + + movzbl -9(%rdi), %eax + movzbl -9(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte16): + movzbl -16(%rdi), %eax + movzbl -16(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte17): + movzbl -15(%rdi), %eax + movzbl -15(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte18): + movzbl -14(%rdi), %eax + movzbl -14(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte19): + movzbl -13(%rdi), %eax + movzbl -13(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte20): + movzbl -12(%rdi), %eax + movzbl -12(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte21): + movzbl -11(%rdi), %eax + movzbl -11(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(Byte22): + movzbl -10(%rdi), %eax + movzbl -10(%rsi), %edx + sub %edx, %eax + ret + + .p2align 4 +L(next_24_bytes): + lea 8(%rdi), %rdi + lea 8(%rsi), %rsi + test $0x01, %dh + jnz L(Byte16) + + test $0x02, %dh + jnz L(Byte17) + + test $0x04, %dh + jnz L(Byte18) + + test $0x08, %dh + jnz L(Byte19) + + test $0x10, %dh + jnz L(Byte20) + + test $0x20, %dh + jnz L(Byte21) + + test $0x40, %dh + jnz L(Byte22) + + movzbl -9(%rdi), %eax + movzbl -9(%rsi), %edx + sub %edx, %eax + ret +# else +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) + ret + + .p2align 4 +L(second_double_word): + mov -12(%rdi), %eax + cmp -12(%rsi), %eax + jne L(find_diff) + ret + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) + ret + + .p2align 4 +L(fourth_double_word): + mov -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) + ret +# endif + + .p2align 4 +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) + cmp $0, %ecx + je L(0bytes) +# ifndef USE_AS_WMEMCMP + cmp $1, %ecx + je L(1bytes) + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif + + .p2align 4 +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) +# ifndef USE_AS_WMEMCMP + cmp $9, %ecx + je L(9bytes) + cmp $10, %ecx + je L(10bytes) + cmp $11, %ecx + je L(11bytes) + cmp $12, %ecx + je L(12bytes) + cmp $13, %ecx + je L(13bytes) + cmp $14, %ecx + je L(14bytes) + jmp L(15bytes) +# else + jmp L(12bytes) +# endif + + .p2align 4 +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) +# ifndef USE_AS_WMEMCMP + cmp $17, %ecx + je L(17bytes) + cmp $18, %ecx + je L(18bytes) + cmp $19, %ecx + je L(19bytes) + cmp $20, %ecx + je L(20bytes) + cmp $21, %ecx + je L(21bytes) + cmp $22, %ecx + je L(22bytes) + jmp L(23bytes) +# else + jmp L(20bytes) +# endif + + .p2align 4 +L(more24bytes): + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) +# ifndef USE_AS_WMEMCMP + cmp $25, %ecx + je L(25bytes) + cmp $26, %ecx + je L(26bytes) + cmp $27, %ecx + je L(27bytes) + cmp $28, %ecx + je L(28bytes) + cmp $29, %ecx + je L(29bytes) + cmp $30, %ecx + je L(30bytes) + jmp L(31bytes) +# else + jmp L(28bytes) +# endif + + .p2align 4 +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) +# ifndef USE_AS_WMEMCMP + cmp $33, %ecx + je L(33bytes) + cmp $34, %ecx + je L(34bytes) + cmp $35, %ecx + je L(35bytes) + cmp $36, %ecx + je L(36bytes) + cmp $37, %ecx + je L(37bytes) + cmp $38, %ecx + je L(38bytes) + jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + .p2align 4 +L(more40bytes): + cmp $40, %ecx + je L(40bytes) +# ifndef USE_AS_WMEMCMP + cmp $41, %ecx + je L(41bytes) + cmp $42, %ecx + je L(42bytes) + cmp $43, %ecx + je L(43bytes) + cmp $44, %ecx + je L(44bytes) + cmp $45, %ecx + je L(45bytes) + cmp $46, %ecx + je L(46bytes) + jmp L(47bytes) + + .p2align 4 +L(44bytes): + movl -44(%rdi), %eax + movl -44(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + movl -40(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + movl -36(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + movl -32(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + movl -28(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + movl -24(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + movl -20(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + movl -16(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + movl -12(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + movl -8(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + movl -4(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# else + .p2align 4 +L(44bytes): + movl -44(%rdi), %eax + cmp -44(%rsi), %eax + jne L(find_diff) +L(40bytes): + movl -40(%rdi), %eax + cmp -40(%rsi), %eax + jne L(find_diff) +L(36bytes): + movl -36(%rdi), %eax + cmp -36(%rsi), %eax + jne L(find_diff) +L(32bytes): + movl -32(%rdi), %eax + cmp -32(%rsi), %eax + jne L(find_diff) +L(28bytes): + movl -28(%rdi), %eax + cmp -28(%rsi), %eax + jne L(find_diff) +L(24bytes): + movl -24(%rdi), %eax + cmp -24(%rsi), %eax + jne L(find_diff) +L(20bytes): + movl -20(%rdi), %eax + cmp -20(%rsi), %eax + jne L(find_diff) +L(16bytes): + movl -16(%rdi), %eax + cmp -16(%rsi), %eax + jne L(find_diff) +L(12bytes): + movl -12(%rdi), %eax + cmp -12(%rsi), %eax + jne L(find_diff) +L(8bytes): + movl -8(%rdi), %eax + cmp -8(%rsi), %eax + jne L(find_diff) +L(4bytes): + movl -4(%rdi), %eax + cmp -4(%rsi), %eax + jne L(find_diff) +L(0bytes): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(45bytes): + movl -45(%rdi), %eax + movl -45(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(41bytes): + movl -41(%rdi), %eax + movl -41(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(37bytes): + movl -37(%rdi), %eax + movl -37(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(33bytes): + movl -33(%rdi), %eax + movl -33(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(29bytes): + movl -29(%rdi), %eax + movl -29(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(25bytes): + movl -25(%rdi), %eax + movl -25(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(21bytes): + movl -21(%rdi), %eax + movl -21(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(17bytes): + movl -17(%rdi), %eax + movl -17(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(13bytes): + movl -13(%rdi), %eax + movl -13(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(9bytes): + movl -9(%rdi), %eax + movl -9(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(5bytes): + movl -5(%rdi), %eax + movl -5(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(1bytes): + movzbl -1(%rdi), %eax + cmpb -1(%rsi), %al + jne L(set) + xor %eax, %eax + ret + + .p2align 4 +L(46bytes): + movl -46(%rdi), %eax + movl -46(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(42bytes): + movl -42(%rdi), %eax + movl -42(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(38bytes): + movl -38(%rdi), %eax + movl -38(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(34bytes): + movl -34(%rdi), %eax + movl -34(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(30bytes): + movl -30(%rdi), %eax + movl -30(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(26bytes): + movl -26(%rdi), %eax + movl -26(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(22bytes): + movl -22(%rdi), %eax + movl -22(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(18bytes): + movl -18(%rdi), %eax + movl -18(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(14bytes): + movl -14(%rdi), %eax + movl -14(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(10bytes): + movl -10(%rdi), %eax + movl -10(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(6bytes): + movl -6(%rdi), %eax + movl -6(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(2bytes): + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmpb %cl, %al + jne L(set) + cmp %ecx, %eax + jne L(set) + xor %eax, %eax + ret + + .p2align 4 +L(47bytes): + movl -47(%rdi), %eax + movl -47(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(43bytes): + movl -43(%rdi), %eax + movl -43(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(39bytes): + movl -39(%rdi), %eax + movl -39(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(35bytes): + movl -35(%rdi), %eax + movl -35(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(31bytes): + movl -31(%rdi), %eax + movl -31(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(27bytes): + movl -27(%rdi), %eax + movl -27(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(23bytes): + movl -23(%rdi), %eax + movl -23(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(19bytes): + movl -19(%rdi), %eax + movl -19(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(15bytes): + movl -15(%rdi), %eax + movl -15(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(11bytes): + movl -11(%rdi), %eax + movl -11(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(7bytes): + movl -7(%rdi), %eax + movl -7(%rsi), %ecx + cmp %ecx, %eax + jne L(find_diff) +L(3bytes): + movzwl -3(%rdi), %eax + movzwl -3(%rsi), %ecx + cmpb %cl, %al + jne L(set) + cmp %ecx, %eax + jne L(set) + movzbl -1(%rdi), %eax + cmpb -1(%rsi), %al + jne L(set) + xor %eax, %eax + ret + + .p2align 4 +L(find_diff): + cmpb %cl, %al + jne L(set) + cmpw %cx, %ax + jne L(set) + shr $16, %eax + shr $16, %ecx + cmpb %cl, %al + jne L(set) + +/* We get there only if we already know there is a +difference. */ + + cmp %ecx, %eax +L(set): + sbb %eax, %eax + sbb $-1, %eax + ret +# else + +/* for wmemcmp */ + .p2align 4 +L(find_diff): + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret +# endif + + .p2align 4 +L(equal): + xor %eax, %eax + ret + +END (MEMCMP) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp.S new file mode 100644 index 0000000000..0c9804b7e9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcmp.S @@ -0,0 +1,78 @@ +/* Multiple versions of memcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(memcmp) + .type memcmp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 1f + HAS_ARCH_FEATURE (AVX2_Usable) + jz 1f + HAS_CPU_FEATURE (MOVBE) + jz 1f + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz 1f + leaq __memcmp_avx2_movbe(%rip), %rax + ret + +1: HAS_CPU_FEATURE (SSSE3) + jnz 2f + leaq __memcmp_sse2(%rip), %rax + ret + +2: HAS_CPU_FEATURE (SSE4_1) + jz 3f + leaq __memcmp_sse4_1(%rip), %rax + ret + +3: leaq __memcmp_ssse3(%rip), %rax + ret + +END(memcmp) + +# undef ENTRY +# define ENTRY(name) \ + .type __memcmp_sse2, @function; \ + .p2align 4; \ + .globl __memcmp_sse2; \ + .hidden __memcmp_sse2; \ + __memcmp_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal memcmp calls through a PLT. + The speedup we get from using SSE4 instructions is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2 +# endif +#endif + +#include "../memcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S new file mode 100644 index 0000000000..4e060a27fd --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S @@ -0,0 +1,3180 @@ +/* memcpy with SSSE3 and REP string + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#if IS_IN (libc) \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +#include "asm-syntax.h" + +#ifndef MEMCPY +# define MEMCPY __memcpy_ssse3_back +# define MEMCPY_CHK __memcpy_chk_ssse3_back +# define MEMPCPY __mempcpy_ssse3_back +# define MEMPCPY_CHK __mempcpy_chk_ssse3_back +#endif + +#define JMPTBL(I, B) I - B + +/* Branch to an entry in a jump table. TABLE is a jump table with + relative offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), INDEX; \ + lea (%r11, INDEX), INDEX; \ + jmp *INDEX; \ + ud2 + + .section .text.ssse3,"ax",@progbits +#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE +ENTRY (MEMPCPY_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMPCPY_CHK) + +ENTRY (MEMPCPY) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (MEMPCPY) +#endif + +#if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif + +ENTRY (MEMCPY) + mov %rdi, %rax +#ifdef USE_AS_MEMPCPY + add %rdx, %rax +#endif + +#ifdef USE_AS_MEMMOVE + cmp %rsi, %rdi + jb L(copy_forward) + je L(bwd_write_0bytes) + cmp $144, %rdx + jae L(copy_backward) + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) +L(copy_forward): +#endif +L(start): + cmp $144, %rdx + jae L(144bytesormore) + +L(fwd_write_less32bytes): +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jbe L(bk_write) +#endif + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) +#ifndef USE_AS_MEMMOVE +L(bk_write): + + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) +#endif + + .p2align 4 +L(144bytesormore): + +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jle L(copy_backward) +#endif + movdqu (%rsi), %xmm0 + mov %rdi, %r8 + and $-16, %rdi + add $16, %rdi + mov %rdi, %r9 + sub %r8, %r9 + sub %r9, %rdx + add %r9, %rsi + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0) +#ifdef DATA_CACHE_SIZE + mov $DATA_CACHE_SIZE, %RCX_LP +#else + mov __x86_data_cache_size(%rip), %RCX_LP +#endif + cmp %rcx, %rdx + jae L(gobble_mem_fwd) + lea L(shl_table_fwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + jmp *%r9 + ud2 + + .p2align 4 +L(copy_backward): +#ifdef DATA_CACHE_SIZE + mov $DATA_CACHE_SIZE, %RCX_LP +#else + mov __x86_data_cache_size(%rip), %RCX_LP +#endif + shl $1, %rcx + cmp %rcx, %rdx + ja L(gobble_mem_bwd) + + add %rdx, %rdi + add %rdx, %rsi + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $0xf, %r9 + xor %r9, %rdi + sub %r9, %rsi + sub %r9, %rdx + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0_bwd) + lea L(shl_table_bwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + jmp *%r9 + ud2 + + .p2align 4 +L(shl_0): + + mov %rdx, %r9 + shr $8, %r9 + add %rdx, %r9 +#ifdef DATA_CACHE_SIZE + cmp $DATA_CACHE_SIZE_HALF, %R9_LP +#else + cmp __x86_data_cache_size_half(%rip), %R9_LP +#endif + jae L(gobble_mem_fwd) + sub $0x80, %rdx + .p2align 4 +L(shl_0_loop): + movdqa (%rsi), %xmm1 + movdqa %xmm1, (%rdi) + movaps 0x10(%rsi), %xmm2 + movaps %xmm2, 0x10(%rdi) + movaps 0x20(%rsi), %xmm3 + movaps %xmm3, 0x20(%rdi) + movaps 0x30(%rsi), %xmm4 + movaps %xmm4, 0x30(%rdi) + movaps 0x40(%rsi), %xmm1 + movaps %xmm1, 0x40(%rdi) + movaps 0x50(%rsi), %xmm2 + movaps %xmm2, 0x50(%rdi) + movaps 0x60(%rsi), %xmm3 + movaps %xmm3, 0x60(%rdi) + movaps 0x70(%rsi), %xmm4 + movaps %xmm4, 0x70(%rdi) + sub $0x80, %rdx + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(shl_0_loop) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_0_bwd): + sub $0x80, %rdx +L(copy_backward_loop): + movaps -0x10(%rsi), %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps -0x20(%rsi), %xmm2 + movaps %xmm2, -0x20(%rdi) + movaps -0x30(%rsi), %xmm3 + movaps %xmm3, -0x30(%rdi) + movaps -0x40(%rsi), %xmm4 + movaps %xmm4, -0x40(%rdi) + movaps -0x50(%rsi), %xmm5 + movaps %xmm5, -0x50(%rdi) + movaps -0x60(%rsi), %xmm5 + movaps %xmm5, -0x60(%rdi) + movaps -0x70(%rsi), %xmm5 + movaps %xmm5, -0x70(%rdi) + movaps -0x80(%rsi), %xmm5 + movaps %xmm5, -0x80(%rdi) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(copy_backward_loop) + + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_1): + sub $0x80, %rdx + movaps -0x01(%rsi), %xmm1 + movaps 0x0f(%rsi), %xmm2 + movaps 0x1f(%rsi), %xmm3 + movaps 0x2f(%rsi), %xmm4 + movaps 0x3f(%rsi), %xmm5 + movaps 0x4f(%rsi), %xmm6 + movaps 0x5f(%rsi), %xmm7 + movaps 0x6f(%rsi), %xmm8 + movaps 0x7f(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $1, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $1, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $1, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $1, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $1, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $1, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $1, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_1) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_1_bwd): + movaps -0x01(%rsi), %xmm1 + + movaps -0x11(%rsi), %xmm2 + palignr $1, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x21(%rsi), %xmm3 + palignr $1, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x31(%rsi), %xmm4 + palignr $1, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x41(%rsi), %xmm5 + palignr $1, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x51(%rsi), %xmm6 + palignr $1, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x61(%rsi), %xmm7 + palignr $1, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x71(%rsi), %xmm8 + palignr $1, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x81(%rsi), %xmm9 + palignr $1, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_1_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_2): + sub $0x80, %rdx + movaps -0x02(%rsi), %xmm1 + movaps 0x0e(%rsi), %xmm2 + movaps 0x1e(%rsi), %xmm3 + movaps 0x2e(%rsi), %xmm4 + movaps 0x3e(%rsi), %xmm5 + movaps 0x4e(%rsi), %xmm6 + movaps 0x5e(%rsi), %xmm7 + movaps 0x6e(%rsi), %xmm8 + movaps 0x7e(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $2, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $2, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $2, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $2, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $2, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $2, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $2, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_2) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_2_bwd): + movaps -0x02(%rsi), %xmm1 + + movaps -0x12(%rsi), %xmm2 + palignr $2, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x22(%rsi), %xmm3 + palignr $2, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x32(%rsi), %xmm4 + palignr $2, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x42(%rsi), %xmm5 + palignr $2, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x52(%rsi), %xmm6 + palignr $2, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x62(%rsi), %xmm7 + palignr $2, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x72(%rsi), %xmm8 + palignr $2, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x82(%rsi), %xmm9 + palignr $2, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_2_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_3): + sub $0x80, %rdx + movaps -0x03(%rsi), %xmm1 + movaps 0x0d(%rsi), %xmm2 + movaps 0x1d(%rsi), %xmm3 + movaps 0x2d(%rsi), %xmm4 + movaps 0x3d(%rsi), %xmm5 + movaps 0x4d(%rsi), %xmm6 + movaps 0x5d(%rsi), %xmm7 + movaps 0x6d(%rsi), %xmm8 + movaps 0x7d(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $3, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $3, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $3, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $3, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $3, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $3, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $3, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_3) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_3_bwd): + movaps -0x03(%rsi), %xmm1 + + movaps -0x13(%rsi), %xmm2 + palignr $3, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x23(%rsi), %xmm3 + palignr $3, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x33(%rsi), %xmm4 + palignr $3, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x43(%rsi), %xmm5 + palignr $3, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x53(%rsi), %xmm6 + palignr $3, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x63(%rsi), %xmm7 + palignr $3, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x73(%rsi), %xmm8 + palignr $3, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x83(%rsi), %xmm9 + palignr $3, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_3_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_4): + sub $0x80, %rdx + movaps -0x04(%rsi), %xmm1 + movaps 0x0c(%rsi), %xmm2 + movaps 0x1c(%rsi), %xmm3 + movaps 0x2c(%rsi), %xmm4 + movaps 0x3c(%rsi), %xmm5 + movaps 0x4c(%rsi), %xmm6 + movaps 0x5c(%rsi), %xmm7 + movaps 0x6c(%rsi), %xmm8 + movaps 0x7c(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $4, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $4, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $4, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $4, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $4, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $4, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $4, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_4) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_4_bwd): + movaps -0x04(%rsi), %xmm1 + + movaps -0x14(%rsi), %xmm2 + palignr $4, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x24(%rsi), %xmm3 + palignr $4, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x34(%rsi), %xmm4 + palignr $4, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x44(%rsi), %xmm5 + palignr $4, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x54(%rsi), %xmm6 + palignr $4, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x64(%rsi), %xmm7 + palignr $4, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x74(%rsi), %xmm8 + palignr $4, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x84(%rsi), %xmm9 + palignr $4, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_4_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_5): + sub $0x80, %rdx + movaps -0x05(%rsi), %xmm1 + movaps 0x0b(%rsi), %xmm2 + movaps 0x1b(%rsi), %xmm3 + movaps 0x2b(%rsi), %xmm4 + movaps 0x3b(%rsi), %xmm5 + movaps 0x4b(%rsi), %xmm6 + movaps 0x5b(%rsi), %xmm7 + movaps 0x6b(%rsi), %xmm8 + movaps 0x7b(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $5, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $5, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $5, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $5, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $5, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $5, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $5, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_5) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_5_bwd): + movaps -0x05(%rsi), %xmm1 + + movaps -0x15(%rsi), %xmm2 + palignr $5, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x25(%rsi), %xmm3 + palignr $5, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x35(%rsi), %xmm4 + palignr $5, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x45(%rsi), %xmm5 + palignr $5, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x55(%rsi), %xmm6 + palignr $5, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x65(%rsi), %xmm7 + palignr $5, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x75(%rsi), %xmm8 + palignr $5, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x85(%rsi), %xmm9 + palignr $5, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_5_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_6): + sub $0x80, %rdx + movaps -0x06(%rsi), %xmm1 + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movaps 0x4a(%rsi), %xmm6 + movaps 0x5a(%rsi), %xmm7 + movaps 0x6a(%rsi), %xmm8 + movaps 0x7a(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $6, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $6, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $6, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $6, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $6, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $6, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $6, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_6) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_6_bwd): + movaps -0x06(%rsi), %xmm1 + + movaps -0x16(%rsi), %xmm2 + palignr $6, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x26(%rsi), %xmm3 + palignr $6, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x36(%rsi), %xmm4 + palignr $6, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x46(%rsi), %xmm5 + palignr $6, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x56(%rsi), %xmm6 + palignr $6, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x66(%rsi), %xmm7 + palignr $6, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x76(%rsi), %xmm8 + palignr $6, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x86(%rsi), %xmm9 + palignr $6, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_6_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_7): + sub $0x80, %rdx + movaps -0x07(%rsi), %xmm1 + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movaps 0x49(%rsi), %xmm6 + movaps 0x59(%rsi), %xmm7 + movaps 0x69(%rsi), %xmm8 + movaps 0x79(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $7, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $7, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $7, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $7, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $7, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $7, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $7, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_7) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_7_bwd): + movaps -0x07(%rsi), %xmm1 + + movaps -0x17(%rsi), %xmm2 + palignr $7, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x27(%rsi), %xmm3 + palignr $7, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x37(%rsi), %xmm4 + palignr $7, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x47(%rsi), %xmm5 + palignr $7, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x57(%rsi), %xmm6 + palignr $7, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x67(%rsi), %xmm7 + palignr $7, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x77(%rsi), %xmm8 + palignr $7, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x87(%rsi), %xmm9 + palignr $7, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_7_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_8): + sub $0x80, %rdx + movaps -0x08(%rsi), %xmm1 + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movaps 0x48(%rsi), %xmm6 + movaps 0x58(%rsi), %xmm7 + movaps 0x68(%rsi), %xmm8 + movaps 0x78(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $8, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $8, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $8, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $8, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $8, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $8, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $8, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_8) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_8_bwd): + movaps -0x08(%rsi), %xmm1 + + movaps -0x18(%rsi), %xmm2 + palignr $8, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x28(%rsi), %xmm3 + palignr $8, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x38(%rsi), %xmm4 + palignr $8, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x48(%rsi), %xmm5 + palignr $8, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x58(%rsi), %xmm6 + palignr $8, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x68(%rsi), %xmm7 + palignr $8, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x78(%rsi), %xmm8 + palignr $8, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x88(%rsi), %xmm9 + palignr $8, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_8_bwd) +L(shl_8_end_bwd): + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_9): + sub $0x80, %rdx + movaps -0x09(%rsi), %xmm1 + movaps 0x07(%rsi), %xmm2 + movaps 0x17(%rsi), %xmm3 + movaps 0x27(%rsi), %xmm4 + movaps 0x37(%rsi), %xmm5 + movaps 0x47(%rsi), %xmm6 + movaps 0x57(%rsi), %xmm7 + movaps 0x67(%rsi), %xmm8 + movaps 0x77(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $9, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $9, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $9, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $9, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $9, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $9, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $9, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_9) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_9_bwd): + movaps -0x09(%rsi), %xmm1 + + movaps -0x19(%rsi), %xmm2 + palignr $9, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x29(%rsi), %xmm3 + palignr $9, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x39(%rsi), %xmm4 + palignr $9, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x49(%rsi), %xmm5 + palignr $9, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x59(%rsi), %xmm6 + palignr $9, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x69(%rsi), %xmm7 + palignr $9, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x79(%rsi), %xmm8 + palignr $9, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x89(%rsi), %xmm9 + palignr $9, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_9_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_10): + sub $0x80, %rdx + movaps -0x0a(%rsi), %xmm1 + movaps 0x06(%rsi), %xmm2 + movaps 0x16(%rsi), %xmm3 + movaps 0x26(%rsi), %xmm4 + movaps 0x36(%rsi), %xmm5 + movaps 0x46(%rsi), %xmm6 + movaps 0x56(%rsi), %xmm7 + movaps 0x66(%rsi), %xmm8 + movaps 0x76(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $10, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $10, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $10, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $10, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $10, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $10, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $10, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_10) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_10_bwd): + movaps -0x0a(%rsi), %xmm1 + + movaps -0x1a(%rsi), %xmm2 + palignr $10, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2a(%rsi), %xmm3 + palignr $10, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3a(%rsi), %xmm4 + palignr $10, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4a(%rsi), %xmm5 + palignr $10, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5a(%rsi), %xmm6 + palignr $10, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6a(%rsi), %xmm7 + palignr $10, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7a(%rsi), %xmm8 + palignr $10, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8a(%rsi), %xmm9 + palignr $10, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_10_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_11): + sub $0x80, %rdx + movaps -0x0b(%rsi), %xmm1 + movaps 0x05(%rsi), %xmm2 + movaps 0x15(%rsi), %xmm3 + movaps 0x25(%rsi), %xmm4 + movaps 0x35(%rsi), %xmm5 + movaps 0x45(%rsi), %xmm6 + movaps 0x55(%rsi), %xmm7 + movaps 0x65(%rsi), %xmm8 + movaps 0x75(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $11, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $11, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $11, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $11, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $11, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $11, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $11, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_11) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_11_bwd): + movaps -0x0b(%rsi), %xmm1 + + movaps -0x1b(%rsi), %xmm2 + palignr $11, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2b(%rsi), %xmm3 + palignr $11, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3b(%rsi), %xmm4 + palignr $11, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4b(%rsi), %xmm5 + palignr $11, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5b(%rsi), %xmm6 + palignr $11, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6b(%rsi), %xmm7 + palignr $11, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7b(%rsi), %xmm8 + palignr $11, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8b(%rsi), %xmm9 + palignr $11, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_11_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_12): + sub $0x80, %rdx + movdqa -0x0c(%rsi), %xmm1 + movaps 0x04(%rsi), %xmm2 + movaps 0x14(%rsi), %xmm3 + movaps 0x24(%rsi), %xmm4 + movaps 0x34(%rsi), %xmm5 + movaps 0x44(%rsi), %xmm6 + movaps 0x54(%rsi), %xmm7 + movaps 0x64(%rsi), %xmm8 + movaps 0x74(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $12, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $12, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $12, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $12, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $12, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $12, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $12, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + + lea 0x80(%rdi), %rdi + jae L(shl_12) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_12_bwd): + movaps -0x0c(%rsi), %xmm1 + + movaps -0x1c(%rsi), %xmm2 + palignr $12, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2c(%rsi), %xmm3 + palignr $12, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3c(%rsi), %xmm4 + palignr $12, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4c(%rsi), %xmm5 + palignr $12, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5c(%rsi), %xmm6 + palignr $12, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6c(%rsi), %xmm7 + palignr $12, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7c(%rsi), %xmm8 + palignr $12, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8c(%rsi), %xmm9 + palignr $12, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_12_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_13): + sub $0x80, %rdx + movaps -0x0d(%rsi), %xmm1 + movaps 0x03(%rsi), %xmm2 + movaps 0x13(%rsi), %xmm3 + movaps 0x23(%rsi), %xmm4 + movaps 0x33(%rsi), %xmm5 + movaps 0x43(%rsi), %xmm6 + movaps 0x53(%rsi), %xmm7 + movaps 0x63(%rsi), %xmm8 + movaps 0x73(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $13, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $13, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $13, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $13, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $13, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $13, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $13, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_13) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_13_bwd): + movaps -0x0d(%rsi), %xmm1 + + movaps -0x1d(%rsi), %xmm2 + palignr $13, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2d(%rsi), %xmm3 + palignr $13, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3d(%rsi), %xmm4 + palignr $13, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4d(%rsi), %xmm5 + palignr $13, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5d(%rsi), %xmm6 + palignr $13, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6d(%rsi), %xmm7 + palignr $13, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7d(%rsi), %xmm8 + palignr $13, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8d(%rsi), %xmm9 + palignr $13, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_13_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_14): + sub $0x80, %rdx + movaps -0x0e(%rsi), %xmm1 + movaps 0x02(%rsi), %xmm2 + movaps 0x12(%rsi), %xmm3 + movaps 0x22(%rsi), %xmm4 + movaps 0x32(%rsi), %xmm5 + movaps 0x42(%rsi), %xmm6 + movaps 0x52(%rsi), %xmm7 + movaps 0x62(%rsi), %xmm8 + movaps 0x72(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $14, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $14, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $14, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $14, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $14, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $14, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $14, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_14) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_14_bwd): + movaps -0x0e(%rsi), %xmm1 + + movaps -0x1e(%rsi), %xmm2 + palignr $14, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2e(%rsi), %xmm3 + palignr $14, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3e(%rsi), %xmm4 + palignr $14, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4e(%rsi), %xmm5 + palignr $14, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5e(%rsi), %xmm6 + palignr $14, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6e(%rsi), %xmm7 + palignr $14, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7e(%rsi), %xmm8 + palignr $14, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8e(%rsi), %xmm9 + palignr $14, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_14_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(shl_15): + sub $0x80, %rdx + movaps -0x0f(%rsi), %xmm1 + movaps 0x01(%rsi), %xmm2 + movaps 0x11(%rsi), %xmm3 + movaps 0x21(%rsi), %xmm4 + movaps 0x31(%rsi), %xmm5 + movaps 0x41(%rsi), %xmm6 + movaps 0x51(%rsi), %xmm7 + movaps 0x61(%rsi), %xmm8 + movaps 0x71(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $15, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $15, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $15, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $15, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $15, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $15, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $15, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_15) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(shl_15_bwd): + movaps -0x0f(%rsi), %xmm1 + + movaps -0x1f(%rsi), %xmm2 + palignr $15, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x2f(%rsi), %xmm3 + palignr $15, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x3f(%rsi), %xmm4 + palignr $15, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x4f(%rsi), %xmm5 + palignr $15, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x5f(%rsi), %xmm6 + palignr $15, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x6f(%rsi), %xmm7 + palignr $15, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x7f(%rsi), %xmm8 + palignr $15, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x8f(%rsi), %xmm9 + palignr $15, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_15_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(gobble_mem_fwd): + movdqu (%rsi), %xmm1 + movdqu %xmm0, (%r8) + movdqa %xmm1, (%rdi) + sub $16, %rdx + add $16, %rsi + add $16, %rdi + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif +#ifdef USE_AS_MEMMOVE + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_fwd_start) +L(memmove_is_memcpy_fwd): +#endif + cmp %rcx, %rdx + ja L(bigger_in_fwd) + mov %rdx, %rcx +L(bigger_in_fwd): + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy_fwd) + + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy_fwd) + add %rcx, %rdx + xor %rcx, %rcx +L(2steps_copy_fwd): + sub $0x80, %rdx +L(gobble_mem_fwd_loop): + sub $0x80, %rdx + prefetcht0 0x200(%rsi) + prefetcht0 0x300(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lfence + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_mem_fwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_fwd_end) + add $0x80, %rdx +L(ll_cache_copy_fwd): + add %rcx, %rdx +L(ll_cache_copy_fwd_start): + sub $0x80, %rdx +L(gobble_ll_loop_fwd): + prefetchnta 0x1c0(%rsi) + prefetchnta 0x280(%rsi) + prefetchnta 0x1c0(%rdi) + prefetchnta 0x280(%rdi) + sub $0x80, %rdx + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + movdqa %xmm2, 0x20(%rdi) + movdqa %xmm3, 0x30(%rdi) + movdqa %xmm4, 0x40(%rdi) + movdqa %xmm5, 0x50(%rdi) + movdqa %xmm6, 0x60(%rdi) + movdqa %xmm7, 0x70(%rdi) + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(gobble_ll_loop_fwd) +L(gobble_mem_fwd_end): + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + .p2align 4 +L(gobble_mem_bwd): + add %rdx, %rsi + add %rdx, %rdi + + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $-16, %rdi + sub %rdi, %r9 + sub %r9, %rsi + sub %r9, %rdx + + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif +#ifdef USE_AS_MEMMOVE + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jbe L(ll_cache_copy_bwd_start) +L(memmove_is_memcpy_bwd): +#endif + cmp %rcx, %rdx + ja L(bigger) + mov %rdx, %rcx +L(bigger): + sub %rcx, %rdx + cmp $0x1000, %rdx + jbe L(ll_cache_copy) + + mov %rcx, %r9 + shl $3, %r9 + cmp %r9, %rdx + jbe L(2steps_copy) + add %rcx, %rdx + xor %rcx, %rcx +L(2steps_copy): + sub $0x80, %rdx +L(gobble_mem_bwd_loop): + sub $0x80, %rdx + prefetcht0 -0x200(%rsi) + prefetcht0 -0x300(%rsi) + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), %xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu -0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + lfence + movntdq %xmm1, -0x10(%rdi) + movntdq %xmm2, -0x20(%rdi) + movntdq %xmm3, -0x30(%rdi) + movntdq %xmm4, -0x40(%rdi) + movntdq %xmm5, -0x50(%rdi) + movntdq %xmm6, -0x60(%rdi) + movntdq %xmm7, -0x70(%rdi) + movntdq %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_mem_bwd_loop) + sfence + cmp $0x80, %rcx + jb L(gobble_mem_bwd_end) + add $0x80, %rdx +L(ll_cache_copy): + add %rcx, %rdx +L(ll_cache_copy_bwd_start): + sub $0x80, %rdx +L(gobble_ll_loop): + prefetchnta -0x1c0(%rsi) + prefetchnta -0x280(%rsi) + prefetchnta -0x1c0(%rdi) + prefetchnta -0x280(%rdi) + sub $0x80, %rdx + movdqu -0x10(%rsi), %xmm1 + movdqu -0x20(%rsi), %xmm2 + movdqu -0x30(%rsi), %xmm3 + movdqu -0x40(%rsi), %xmm4 + movdqu -0x50(%rsi), %xmm5 + movdqu -0x60(%rsi), %xmm6 + movdqu -0x70(%rsi), %xmm7 + movdqu -0x80(%rsi), %xmm8 + movdqa %xmm1, -0x10(%rdi) + movdqa %xmm2, -0x20(%rdi) + movdqa %xmm3, -0x30(%rdi) + movdqa %xmm4, -0x40(%rdi) + movdqa %xmm5, -0x50(%rdi) + movdqa %xmm6, -0x60(%rdi) + movdqa %xmm7, -0x70(%rdi) + movdqa %xmm8, -0x80(%rdi) + lea -0x80(%rsi), %rsi + lea -0x80(%rdi), %rdi + jae L(gobble_ll_loop) +L(gobble_mem_bwd_end): + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rsi + sub %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + .p2align 4 +L(fwd_write_128bytes): + lddqu -128(%rsi), %xmm0 + movdqu %xmm0, -128(%rdi) +L(fwd_write_112bytes): + lddqu -112(%rsi), %xmm0 + movdqu %xmm0, -112(%rdi) +L(fwd_write_96bytes): + lddqu -96(%rsi), %xmm0 + movdqu %xmm0, -96(%rdi) +L(fwd_write_80bytes): + lddqu -80(%rsi), %xmm0 + movdqu %xmm0, -80(%rdi) +L(fwd_write_64bytes): + lddqu -64(%rsi), %xmm0 + movdqu %xmm0, -64(%rdi) +L(fwd_write_48bytes): + lddqu -48(%rsi), %xmm0 + movdqu %xmm0, -48(%rdi) +L(fwd_write_32bytes): + lddqu -32(%rsi), %xmm0 + movdqu %xmm0, -32(%rdi) +L(fwd_write_16bytes): + lddqu -16(%rsi), %xmm0 + movdqu %xmm0, -16(%rdi) +L(fwd_write_0bytes): + ret + + + .p2align 4 +L(fwd_write_143bytes): + lddqu -143(%rsi), %xmm0 + movdqu %xmm0, -143(%rdi) +L(fwd_write_127bytes): + lddqu -127(%rsi), %xmm0 + movdqu %xmm0, -127(%rdi) +L(fwd_write_111bytes): + lddqu -111(%rsi), %xmm0 + movdqu %xmm0, -111(%rdi) +L(fwd_write_95bytes): + lddqu -95(%rsi), %xmm0 + movdqu %xmm0, -95(%rdi) +L(fwd_write_79bytes): + lddqu -79(%rsi), %xmm0 + movdqu %xmm0, -79(%rdi) +L(fwd_write_63bytes): + lddqu -63(%rsi), %xmm0 + movdqu %xmm0, -63(%rdi) +L(fwd_write_47bytes): + lddqu -47(%rsi), %xmm0 + movdqu %xmm0, -47(%rdi) +L(fwd_write_31bytes): + lddqu -31(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -31(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_15bytes): + mov -15(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -15(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_142bytes): + lddqu -142(%rsi), %xmm0 + movdqu %xmm0, -142(%rdi) +L(fwd_write_126bytes): + lddqu -126(%rsi), %xmm0 + movdqu %xmm0, -126(%rdi) +L(fwd_write_110bytes): + lddqu -110(%rsi), %xmm0 + movdqu %xmm0, -110(%rdi) +L(fwd_write_94bytes): + lddqu -94(%rsi), %xmm0 + movdqu %xmm0, -94(%rdi) +L(fwd_write_78bytes): + lddqu -78(%rsi), %xmm0 + movdqu %xmm0, -78(%rdi) +L(fwd_write_62bytes): + lddqu -62(%rsi), %xmm0 + movdqu %xmm0, -62(%rdi) +L(fwd_write_46bytes): + lddqu -46(%rsi), %xmm0 + movdqu %xmm0, -46(%rdi) +L(fwd_write_30bytes): + lddqu -30(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -30(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_14bytes): + mov -14(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -14(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_141bytes): + lddqu -141(%rsi), %xmm0 + movdqu %xmm0, -141(%rdi) +L(fwd_write_125bytes): + lddqu -125(%rsi), %xmm0 + movdqu %xmm0, -125(%rdi) +L(fwd_write_109bytes): + lddqu -109(%rsi), %xmm0 + movdqu %xmm0, -109(%rdi) +L(fwd_write_93bytes): + lddqu -93(%rsi), %xmm0 + movdqu %xmm0, -93(%rdi) +L(fwd_write_77bytes): + lddqu -77(%rsi), %xmm0 + movdqu %xmm0, -77(%rdi) +L(fwd_write_61bytes): + lddqu -61(%rsi), %xmm0 + movdqu %xmm0, -61(%rdi) +L(fwd_write_45bytes): + lddqu -45(%rsi), %xmm0 + movdqu %xmm0, -45(%rdi) +L(fwd_write_29bytes): + lddqu -29(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -29(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_13bytes): + mov -13(%rsi), %rdx + mov -8(%rsi), %rcx + mov %rdx, -13(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_140bytes): + lddqu -140(%rsi), %xmm0 + movdqu %xmm0, -140(%rdi) +L(fwd_write_124bytes): + lddqu -124(%rsi), %xmm0 + movdqu %xmm0, -124(%rdi) +L(fwd_write_108bytes): + lddqu -108(%rsi), %xmm0 + movdqu %xmm0, -108(%rdi) +L(fwd_write_92bytes): + lddqu -92(%rsi), %xmm0 + movdqu %xmm0, -92(%rdi) +L(fwd_write_76bytes): + lddqu -76(%rsi), %xmm0 + movdqu %xmm0, -76(%rdi) +L(fwd_write_60bytes): + lddqu -60(%rsi), %xmm0 + movdqu %xmm0, -60(%rdi) +L(fwd_write_44bytes): + lddqu -44(%rsi), %xmm0 + movdqu %xmm0, -44(%rdi) +L(fwd_write_28bytes): + lddqu -28(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -28(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_12bytes): + mov -12(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -12(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_139bytes): + lddqu -139(%rsi), %xmm0 + movdqu %xmm0, -139(%rdi) +L(fwd_write_123bytes): + lddqu -123(%rsi), %xmm0 + movdqu %xmm0, -123(%rdi) +L(fwd_write_107bytes): + lddqu -107(%rsi), %xmm0 + movdqu %xmm0, -107(%rdi) +L(fwd_write_91bytes): + lddqu -91(%rsi), %xmm0 + movdqu %xmm0, -91(%rdi) +L(fwd_write_75bytes): + lddqu -75(%rsi), %xmm0 + movdqu %xmm0, -75(%rdi) +L(fwd_write_59bytes): + lddqu -59(%rsi), %xmm0 + movdqu %xmm0, -59(%rdi) +L(fwd_write_43bytes): + lddqu -43(%rsi), %xmm0 + movdqu %xmm0, -43(%rdi) +L(fwd_write_27bytes): + lddqu -27(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -27(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_11bytes): + mov -11(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -11(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_138bytes): + lddqu -138(%rsi), %xmm0 + movdqu %xmm0, -138(%rdi) +L(fwd_write_122bytes): + lddqu -122(%rsi), %xmm0 + movdqu %xmm0, -122(%rdi) +L(fwd_write_106bytes): + lddqu -106(%rsi), %xmm0 + movdqu %xmm0, -106(%rdi) +L(fwd_write_90bytes): + lddqu -90(%rsi), %xmm0 + movdqu %xmm0, -90(%rdi) +L(fwd_write_74bytes): + lddqu -74(%rsi), %xmm0 + movdqu %xmm0, -74(%rdi) +L(fwd_write_58bytes): + lddqu -58(%rsi), %xmm0 + movdqu %xmm0, -58(%rdi) +L(fwd_write_42bytes): + lddqu -42(%rsi), %xmm0 + movdqu %xmm0, -42(%rdi) +L(fwd_write_26bytes): + lddqu -26(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -26(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_10bytes): + mov -10(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -10(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_137bytes): + lddqu -137(%rsi), %xmm0 + movdqu %xmm0, -137(%rdi) +L(fwd_write_121bytes): + lddqu -121(%rsi), %xmm0 + movdqu %xmm0, -121(%rdi) +L(fwd_write_105bytes): + lddqu -105(%rsi), %xmm0 + movdqu %xmm0, -105(%rdi) +L(fwd_write_89bytes): + lddqu -89(%rsi), %xmm0 + movdqu %xmm0, -89(%rdi) +L(fwd_write_73bytes): + lddqu -73(%rsi), %xmm0 + movdqu %xmm0, -73(%rdi) +L(fwd_write_57bytes): + lddqu -57(%rsi), %xmm0 + movdqu %xmm0, -57(%rdi) +L(fwd_write_41bytes): + lddqu -41(%rsi), %xmm0 + movdqu %xmm0, -41(%rdi) +L(fwd_write_25bytes): + lddqu -25(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -25(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_9bytes): + mov -9(%rsi), %rdx + mov -4(%rsi), %ecx + mov %rdx, -9(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_136bytes): + lddqu -136(%rsi), %xmm0 + movdqu %xmm0, -136(%rdi) +L(fwd_write_120bytes): + lddqu -120(%rsi), %xmm0 + movdqu %xmm0, -120(%rdi) +L(fwd_write_104bytes): + lddqu -104(%rsi), %xmm0 + movdqu %xmm0, -104(%rdi) +L(fwd_write_88bytes): + lddqu -88(%rsi), %xmm0 + movdqu %xmm0, -88(%rdi) +L(fwd_write_72bytes): + lddqu -72(%rsi), %xmm0 + movdqu %xmm0, -72(%rdi) +L(fwd_write_56bytes): + lddqu -56(%rsi), %xmm0 + movdqu %xmm0, -56(%rdi) +L(fwd_write_40bytes): + lddqu -40(%rsi), %xmm0 + movdqu %xmm0, -40(%rdi) +L(fwd_write_24bytes): + lddqu -24(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -24(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_8bytes): + mov -8(%rsi), %rdx + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(fwd_write_135bytes): + lddqu -135(%rsi), %xmm0 + movdqu %xmm0, -135(%rdi) +L(fwd_write_119bytes): + lddqu -119(%rsi), %xmm0 + movdqu %xmm0, -119(%rdi) +L(fwd_write_103bytes): + lddqu -103(%rsi), %xmm0 + movdqu %xmm0, -103(%rdi) +L(fwd_write_87bytes): + lddqu -87(%rsi), %xmm0 + movdqu %xmm0, -87(%rdi) +L(fwd_write_71bytes): + lddqu -71(%rsi), %xmm0 + movdqu %xmm0, -71(%rdi) +L(fwd_write_55bytes): + lddqu -55(%rsi), %xmm0 + movdqu %xmm0, -55(%rdi) +L(fwd_write_39bytes): + lddqu -39(%rsi), %xmm0 + movdqu %xmm0, -39(%rdi) +L(fwd_write_23bytes): + lddqu -23(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -23(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_7bytes): + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_134bytes): + lddqu -134(%rsi), %xmm0 + movdqu %xmm0, -134(%rdi) +L(fwd_write_118bytes): + lddqu -118(%rsi), %xmm0 + movdqu %xmm0, -118(%rdi) +L(fwd_write_102bytes): + lddqu -102(%rsi), %xmm0 + movdqu %xmm0, -102(%rdi) +L(fwd_write_86bytes): + lddqu -86(%rsi), %xmm0 + movdqu %xmm0, -86(%rdi) +L(fwd_write_70bytes): + lddqu -70(%rsi), %xmm0 + movdqu %xmm0, -70(%rdi) +L(fwd_write_54bytes): + lddqu -54(%rsi), %xmm0 + movdqu %xmm0, -54(%rdi) +L(fwd_write_38bytes): + lddqu -38(%rsi), %xmm0 + movdqu %xmm0, -38(%rdi) +L(fwd_write_22bytes): + lddqu -22(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -22(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_6bytes): + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_133bytes): + lddqu -133(%rsi), %xmm0 + movdqu %xmm0, -133(%rdi) +L(fwd_write_117bytes): + lddqu -117(%rsi), %xmm0 + movdqu %xmm0, -117(%rdi) +L(fwd_write_101bytes): + lddqu -101(%rsi), %xmm0 + movdqu %xmm0, -101(%rdi) +L(fwd_write_85bytes): + lddqu -85(%rsi), %xmm0 + movdqu %xmm0, -85(%rdi) +L(fwd_write_69bytes): + lddqu -69(%rsi), %xmm0 + movdqu %xmm0, -69(%rdi) +L(fwd_write_53bytes): + lddqu -53(%rsi), %xmm0 + movdqu %xmm0, -53(%rdi) +L(fwd_write_37bytes): + lddqu -37(%rsi), %xmm0 + movdqu %xmm0, -37(%rdi) +L(fwd_write_21bytes): + lddqu -21(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -21(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_5bytes): + mov -5(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -5(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_132bytes): + lddqu -132(%rsi), %xmm0 + movdqu %xmm0, -132(%rdi) +L(fwd_write_116bytes): + lddqu -116(%rsi), %xmm0 + movdqu %xmm0, -116(%rdi) +L(fwd_write_100bytes): + lddqu -100(%rsi), %xmm0 + movdqu %xmm0, -100(%rdi) +L(fwd_write_84bytes): + lddqu -84(%rsi), %xmm0 + movdqu %xmm0, -84(%rdi) +L(fwd_write_68bytes): + lddqu -68(%rsi), %xmm0 + movdqu %xmm0, -68(%rdi) +L(fwd_write_52bytes): + lddqu -52(%rsi), %xmm0 + movdqu %xmm0, -52(%rdi) +L(fwd_write_36bytes): + lddqu -36(%rsi), %xmm0 + movdqu %xmm0, -36(%rdi) +L(fwd_write_20bytes): + lddqu -20(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -20(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_4bytes): + mov -4(%rsi), %edx + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(fwd_write_131bytes): + lddqu -131(%rsi), %xmm0 + movdqu %xmm0, -131(%rdi) +L(fwd_write_115bytes): + lddqu -115(%rsi), %xmm0 + movdqu %xmm0, -115(%rdi) +L(fwd_write_99bytes): + lddqu -99(%rsi), %xmm0 + movdqu %xmm0, -99(%rdi) +L(fwd_write_83bytes): + lddqu -83(%rsi), %xmm0 + movdqu %xmm0, -83(%rdi) +L(fwd_write_67bytes): + lddqu -67(%rsi), %xmm0 + movdqu %xmm0, -67(%rdi) +L(fwd_write_51bytes): + lddqu -51(%rsi), %xmm0 + movdqu %xmm0, -51(%rdi) +L(fwd_write_35bytes): + lddqu -35(%rsi), %xmm0 + movdqu %xmm0, -35(%rdi) +L(fwd_write_19bytes): + lddqu -19(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -19(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_3bytes): + mov -3(%rsi), %dx + mov -2(%rsi), %cx + mov %dx, -3(%rdi) + mov %cx, -2(%rdi) + ret + + .p2align 4 +L(fwd_write_130bytes): + lddqu -130(%rsi), %xmm0 + movdqu %xmm0, -130(%rdi) +L(fwd_write_114bytes): + lddqu -114(%rsi), %xmm0 + movdqu %xmm0, -114(%rdi) +L(fwd_write_98bytes): + lddqu -98(%rsi), %xmm0 + movdqu %xmm0, -98(%rdi) +L(fwd_write_82bytes): + lddqu -82(%rsi), %xmm0 + movdqu %xmm0, -82(%rdi) +L(fwd_write_66bytes): + lddqu -66(%rsi), %xmm0 + movdqu %xmm0, -66(%rdi) +L(fwd_write_50bytes): + lddqu -50(%rsi), %xmm0 + movdqu %xmm0, -50(%rdi) +L(fwd_write_34bytes): + lddqu -34(%rsi), %xmm0 + movdqu %xmm0, -34(%rdi) +L(fwd_write_18bytes): + lddqu -18(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -18(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_2bytes): + movzwl -2(%rsi), %edx + mov %dx, -2(%rdi) + ret + + .p2align 4 +L(fwd_write_129bytes): + lddqu -129(%rsi), %xmm0 + movdqu %xmm0, -129(%rdi) +L(fwd_write_113bytes): + lddqu -113(%rsi), %xmm0 + movdqu %xmm0, -113(%rdi) +L(fwd_write_97bytes): + lddqu -97(%rsi), %xmm0 + movdqu %xmm0, -97(%rdi) +L(fwd_write_81bytes): + lddqu -81(%rsi), %xmm0 + movdqu %xmm0, -81(%rdi) +L(fwd_write_65bytes): + lddqu -65(%rsi), %xmm0 + movdqu %xmm0, -65(%rdi) +L(fwd_write_49bytes): + lddqu -49(%rsi), %xmm0 + movdqu %xmm0, -49(%rdi) +L(fwd_write_33bytes): + lddqu -33(%rsi), %xmm0 + movdqu %xmm0, -33(%rdi) +L(fwd_write_17bytes): + lddqu -17(%rsi), %xmm0 + lddqu -16(%rsi), %xmm1 + movdqu %xmm0, -17(%rdi) + movdqu %xmm1, -16(%rdi) + ret + + .p2align 4 +L(fwd_write_1bytes): + movzbl -1(%rsi), %edx + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(bwd_write_128bytes): + lddqu 112(%rsi), %xmm0 + movdqu %xmm0, 112(%rdi) +L(bwd_write_112bytes): + lddqu 96(%rsi), %xmm0 + movdqu %xmm0, 96(%rdi) +L(bwd_write_96bytes): + lddqu 80(%rsi), %xmm0 + movdqu %xmm0, 80(%rdi) +L(bwd_write_80bytes): + lddqu 64(%rsi), %xmm0 + movdqu %xmm0, 64(%rdi) +L(bwd_write_64bytes): + lddqu 48(%rsi), %xmm0 + movdqu %xmm0, 48(%rdi) +L(bwd_write_48bytes): + lddqu 32(%rsi), %xmm0 + movdqu %xmm0, 32(%rdi) +L(bwd_write_32bytes): + lddqu 16(%rsi), %xmm0 + movdqu %xmm0, 16(%rdi) +L(bwd_write_16bytes): + lddqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +L(bwd_write_0bytes): + ret + + .p2align 4 +L(bwd_write_143bytes): + lddqu 127(%rsi), %xmm0 + movdqu %xmm0, 127(%rdi) +L(bwd_write_127bytes): + lddqu 111(%rsi), %xmm0 + movdqu %xmm0, 111(%rdi) +L(bwd_write_111bytes): + lddqu 95(%rsi), %xmm0 + movdqu %xmm0, 95(%rdi) +L(bwd_write_95bytes): + lddqu 79(%rsi), %xmm0 + movdqu %xmm0, 79(%rdi) +L(bwd_write_79bytes): + lddqu 63(%rsi), %xmm0 + movdqu %xmm0, 63(%rdi) +L(bwd_write_63bytes): + lddqu 47(%rsi), %xmm0 + movdqu %xmm0, 47(%rdi) +L(bwd_write_47bytes): + lddqu 31(%rsi), %xmm0 + movdqu %xmm0, 31(%rdi) +L(bwd_write_31bytes): + lddqu 15(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 15(%rdi) + movdqu %xmm1, (%rdi) + ret + + + .p2align 4 +L(bwd_write_15bytes): + mov 7(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 7(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_142bytes): + lddqu 126(%rsi), %xmm0 + movdqu %xmm0, 126(%rdi) +L(bwd_write_126bytes): + lddqu 110(%rsi), %xmm0 + movdqu %xmm0, 110(%rdi) +L(bwd_write_110bytes): + lddqu 94(%rsi), %xmm0 + movdqu %xmm0, 94(%rdi) +L(bwd_write_94bytes): + lddqu 78(%rsi), %xmm0 + movdqu %xmm0, 78(%rdi) +L(bwd_write_78bytes): + lddqu 62(%rsi), %xmm0 + movdqu %xmm0, 62(%rdi) +L(bwd_write_62bytes): + lddqu 46(%rsi), %xmm0 + movdqu %xmm0, 46(%rdi) +L(bwd_write_46bytes): + lddqu 30(%rsi), %xmm0 + movdqu %xmm0, 30(%rdi) +L(bwd_write_30bytes): + lddqu 14(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 14(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_14bytes): + mov 6(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 6(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_141bytes): + lddqu 125(%rsi), %xmm0 + movdqu %xmm0, 125(%rdi) +L(bwd_write_125bytes): + lddqu 109(%rsi), %xmm0 + movdqu %xmm0, 109(%rdi) +L(bwd_write_109bytes): + lddqu 93(%rsi), %xmm0 + movdqu %xmm0, 93(%rdi) +L(bwd_write_93bytes): + lddqu 77(%rsi), %xmm0 + movdqu %xmm0, 77(%rdi) +L(bwd_write_77bytes): + lddqu 61(%rsi), %xmm0 + movdqu %xmm0, 61(%rdi) +L(bwd_write_61bytes): + lddqu 45(%rsi), %xmm0 + movdqu %xmm0, 45(%rdi) +L(bwd_write_45bytes): + lddqu 29(%rsi), %xmm0 + movdqu %xmm0, 29(%rdi) +L(bwd_write_29bytes): + lddqu 13(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 13(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_13bytes): + mov 5(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 5(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_140bytes): + lddqu 124(%rsi), %xmm0 + movdqu %xmm0, 124(%rdi) +L(bwd_write_124bytes): + lddqu 108(%rsi), %xmm0 + movdqu %xmm0, 108(%rdi) +L(bwd_write_108bytes): + lddqu 92(%rsi), %xmm0 + movdqu %xmm0, 92(%rdi) +L(bwd_write_92bytes): + lddqu 76(%rsi), %xmm0 + movdqu %xmm0, 76(%rdi) +L(bwd_write_76bytes): + lddqu 60(%rsi), %xmm0 + movdqu %xmm0, 60(%rdi) +L(bwd_write_60bytes): + lddqu 44(%rsi), %xmm0 + movdqu %xmm0, 44(%rdi) +L(bwd_write_44bytes): + lddqu 28(%rsi), %xmm0 + movdqu %xmm0, 28(%rdi) +L(bwd_write_28bytes): + lddqu 12(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 12(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_12bytes): + mov 4(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 4(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_139bytes): + lddqu 123(%rsi), %xmm0 + movdqu %xmm0, 123(%rdi) +L(bwd_write_123bytes): + lddqu 107(%rsi), %xmm0 + movdqu %xmm0, 107(%rdi) +L(bwd_write_107bytes): + lddqu 91(%rsi), %xmm0 + movdqu %xmm0, 91(%rdi) +L(bwd_write_91bytes): + lddqu 75(%rsi), %xmm0 + movdqu %xmm0, 75(%rdi) +L(bwd_write_75bytes): + lddqu 59(%rsi), %xmm0 + movdqu %xmm0, 59(%rdi) +L(bwd_write_59bytes): + lddqu 43(%rsi), %xmm0 + movdqu %xmm0, 43(%rdi) +L(bwd_write_43bytes): + lddqu 27(%rsi), %xmm0 + movdqu %xmm0, 27(%rdi) +L(bwd_write_27bytes): + lddqu 11(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 11(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_11bytes): + mov 3(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 3(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_138bytes): + lddqu 122(%rsi), %xmm0 + movdqu %xmm0, 122(%rdi) +L(bwd_write_122bytes): + lddqu 106(%rsi), %xmm0 + movdqu %xmm0, 106(%rdi) +L(bwd_write_106bytes): + lddqu 90(%rsi), %xmm0 + movdqu %xmm0, 90(%rdi) +L(bwd_write_90bytes): + lddqu 74(%rsi), %xmm0 + movdqu %xmm0, 74(%rdi) +L(bwd_write_74bytes): + lddqu 58(%rsi), %xmm0 + movdqu %xmm0, 58(%rdi) +L(bwd_write_58bytes): + lddqu 42(%rsi), %xmm0 + movdqu %xmm0, 42(%rdi) +L(bwd_write_42bytes): + lddqu 26(%rsi), %xmm0 + movdqu %xmm0, 26(%rdi) +L(bwd_write_26bytes): + lddqu 10(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 10(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_10bytes): + mov 2(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 2(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_137bytes): + lddqu 121(%rsi), %xmm0 + movdqu %xmm0, 121(%rdi) +L(bwd_write_121bytes): + lddqu 105(%rsi), %xmm0 + movdqu %xmm0, 105(%rdi) +L(bwd_write_105bytes): + lddqu 89(%rsi), %xmm0 + movdqu %xmm0, 89(%rdi) +L(bwd_write_89bytes): + lddqu 73(%rsi), %xmm0 + movdqu %xmm0, 73(%rdi) +L(bwd_write_73bytes): + lddqu 57(%rsi), %xmm0 + movdqu %xmm0, 57(%rdi) +L(bwd_write_57bytes): + lddqu 41(%rsi), %xmm0 + movdqu %xmm0, 41(%rdi) +L(bwd_write_41bytes): + lddqu 25(%rsi), %xmm0 + movdqu %xmm0, 25(%rdi) +L(bwd_write_25bytes): + lddqu 9(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 9(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_9bytes): + mov 1(%rsi), %rdx + mov (%rsi), %rcx + mov %rdx, 1(%rdi) + mov %rcx, (%rdi) + ret + + .p2align 4 +L(bwd_write_136bytes): + lddqu 120(%rsi), %xmm0 + movdqu %xmm0, 120(%rdi) +L(bwd_write_120bytes): + lddqu 104(%rsi), %xmm0 + movdqu %xmm0, 104(%rdi) +L(bwd_write_104bytes): + lddqu 88(%rsi), %xmm0 + movdqu %xmm0, 88(%rdi) +L(bwd_write_88bytes): + lddqu 72(%rsi), %xmm0 + movdqu %xmm0, 72(%rdi) +L(bwd_write_72bytes): + lddqu 56(%rsi), %xmm0 + movdqu %xmm0, 56(%rdi) +L(bwd_write_56bytes): + lddqu 40(%rsi), %xmm0 + movdqu %xmm0, 40(%rdi) +L(bwd_write_40bytes): + lddqu 24(%rsi), %xmm0 + movdqu %xmm0, 24(%rdi) +L(bwd_write_24bytes): + lddqu 8(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 8(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_8bytes): + mov (%rsi), %rdx + mov %rdx, (%rdi) + ret + + .p2align 4 +L(bwd_write_135bytes): + lddqu 119(%rsi), %xmm0 + movdqu %xmm0, 119(%rdi) +L(bwd_write_119bytes): + lddqu 103(%rsi), %xmm0 + movdqu %xmm0, 103(%rdi) +L(bwd_write_103bytes): + lddqu 87(%rsi), %xmm0 + movdqu %xmm0, 87(%rdi) +L(bwd_write_87bytes): + lddqu 71(%rsi), %xmm0 + movdqu %xmm0, 71(%rdi) +L(bwd_write_71bytes): + lddqu 55(%rsi), %xmm0 + movdqu %xmm0, 55(%rdi) +L(bwd_write_55bytes): + lddqu 39(%rsi), %xmm0 + movdqu %xmm0, 39(%rdi) +L(bwd_write_39bytes): + lddqu 23(%rsi), %xmm0 + movdqu %xmm0, 23(%rdi) +L(bwd_write_23bytes): + lddqu 7(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 7(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_7bytes): + mov 3(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 3(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_134bytes): + lddqu 118(%rsi), %xmm0 + movdqu %xmm0, 118(%rdi) +L(bwd_write_118bytes): + lddqu 102(%rsi), %xmm0 + movdqu %xmm0, 102(%rdi) +L(bwd_write_102bytes): + lddqu 86(%rsi), %xmm0 + movdqu %xmm0, 86(%rdi) +L(bwd_write_86bytes): + lddqu 70(%rsi), %xmm0 + movdqu %xmm0, 70(%rdi) +L(bwd_write_70bytes): + lddqu 54(%rsi), %xmm0 + movdqu %xmm0, 54(%rdi) +L(bwd_write_54bytes): + lddqu 38(%rsi), %xmm0 + movdqu %xmm0, 38(%rdi) +L(bwd_write_38bytes): + lddqu 22(%rsi), %xmm0 + movdqu %xmm0, 22(%rdi) +L(bwd_write_22bytes): + lddqu 6(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 6(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_6bytes): + mov 2(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 2(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_133bytes): + lddqu 117(%rsi), %xmm0 + movdqu %xmm0, 117(%rdi) +L(bwd_write_117bytes): + lddqu 101(%rsi), %xmm0 + movdqu %xmm0, 101(%rdi) +L(bwd_write_101bytes): + lddqu 85(%rsi), %xmm0 + movdqu %xmm0, 85(%rdi) +L(bwd_write_85bytes): + lddqu 69(%rsi), %xmm0 + movdqu %xmm0, 69(%rdi) +L(bwd_write_69bytes): + lddqu 53(%rsi), %xmm0 + movdqu %xmm0, 53(%rdi) +L(bwd_write_53bytes): + lddqu 37(%rsi), %xmm0 + movdqu %xmm0, 37(%rdi) +L(bwd_write_37bytes): + lddqu 21(%rsi), %xmm0 + movdqu %xmm0, 21(%rdi) +L(bwd_write_21bytes): + lddqu 5(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 5(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_5bytes): + mov 1(%rsi), %edx + mov (%rsi), %ecx + mov %edx, 1(%rdi) + mov %ecx, (%rdi) + ret + + .p2align 4 +L(bwd_write_132bytes): + lddqu 116(%rsi), %xmm0 + movdqu %xmm0, 116(%rdi) +L(bwd_write_116bytes): + lddqu 100(%rsi), %xmm0 + movdqu %xmm0, 100(%rdi) +L(bwd_write_100bytes): + lddqu 84(%rsi), %xmm0 + movdqu %xmm0, 84(%rdi) +L(bwd_write_84bytes): + lddqu 68(%rsi), %xmm0 + movdqu %xmm0, 68(%rdi) +L(bwd_write_68bytes): + lddqu 52(%rsi), %xmm0 + movdqu %xmm0, 52(%rdi) +L(bwd_write_52bytes): + lddqu 36(%rsi), %xmm0 + movdqu %xmm0, 36(%rdi) +L(bwd_write_36bytes): + lddqu 20(%rsi), %xmm0 + movdqu %xmm0, 20(%rdi) +L(bwd_write_20bytes): + lddqu 4(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 4(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_4bytes): + mov (%rsi), %edx + mov %edx, (%rdi) + ret + + .p2align 4 +L(bwd_write_131bytes): + lddqu 115(%rsi), %xmm0 + movdqu %xmm0, 115(%rdi) +L(bwd_write_115bytes): + lddqu 99(%rsi), %xmm0 + movdqu %xmm0, 99(%rdi) +L(bwd_write_99bytes): + lddqu 83(%rsi), %xmm0 + movdqu %xmm0, 83(%rdi) +L(bwd_write_83bytes): + lddqu 67(%rsi), %xmm0 + movdqu %xmm0, 67(%rdi) +L(bwd_write_67bytes): + lddqu 51(%rsi), %xmm0 + movdqu %xmm0, 51(%rdi) +L(bwd_write_51bytes): + lddqu 35(%rsi), %xmm0 + movdqu %xmm0, 35(%rdi) +L(bwd_write_35bytes): + lddqu 19(%rsi), %xmm0 + movdqu %xmm0, 19(%rdi) +L(bwd_write_19bytes): + lddqu 3(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 3(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_3bytes): + mov 1(%rsi), %dx + mov (%rsi), %cx + mov %dx, 1(%rdi) + mov %cx, (%rdi) + ret + + .p2align 4 +L(bwd_write_130bytes): + lddqu 114(%rsi), %xmm0 + movdqu %xmm0, 114(%rdi) +L(bwd_write_114bytes): + lddqu 98(%rsi), %xmm0 + movdqu %xmm0, 98(%rdi) +L(bwd_write_98bytes): + lddqu 82(%rsi), %xmm0 + movdqu %xmm0, 82(%rdi) +L(bwd_write_82bytes): + lddqu 66(%rsi), %xmm0 + movdqu %xmm0, 66(%rdi) +L(bwd_write_66bytes): + lddqu 50(%rsi), %xmm0 + movdqu %xmm0, 50(%rdi) +L(bwd_write_50bytes): + lddqu 34(%rsi), %xmm0 + movdqu %xmm0, 34(%rdi) +L(bwd_write_34bytes): + lddqu 18(%rsi), %xmm0 + movdqu %xmm0, 18(%rdi) +L(bwd_write_18bytes): + lddqu 2(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 2(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_2bytes): + movzwl (%rsi), %edx + mov %dx, (%rdi) + ret + + .p2align 4 +L(bwd_write_129bytes): + lddqu 113(%rsi), %xmm0 + movdqu %xmm0, 113(%rdi) +L(bwd_write_113bytes): + lddqu 97(%rsi), %xmm0 + movdqu %xmm0, 97(%rdi) +L(bwd_write_97bytes): + lddqu 81(%rsi), %xmm0 + movdqu %xmm0, 81(%rdi) +L(bwd_write_81bytes): + lddqu 65(%rsi), %xmm0 + movdqu %xmm0, 65(%rdi) +L(bwd_write_65bytes): + lddqu 49(%rsi), %xmm0 + movdqu %xmm0, 49(%rdi) +L(bwd_write_49bytes): + lddqu 33(%rsi), %xmm0 + movdqu %xmm0, 33(%rdi) +L(bwd_write_33bytes): + lddqu 17(%rsi), %xmm0 + movdqu %xmm0, 17(%rdi) +L(bwd_write_17bytes): + lddqu 1(%rsi), %xmm0 + lddqu (%rsi), %xmm1 + movdqu %xmm0, 1(%rdi) + movdqu %xmm1, (%rdi) + ret + + .p2align 4 +L(bwd_write_1bytes): + movzbl (%rsi), %edx + mov %dl, (%rdi) + ret + +END (MEMCPY) + + .section .rodata.ssse3,"a",@progbits + .p2align 3 +L(table_144_bytes_bwd): + .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) + .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) + + .p2align 3 +L(table_144_bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) + .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) + + .p2align 3 +L(shl_table_fwd): + .int JMPTBL (L(shl_0), L(shl_table_fwd)) + .int JMPTBL (L(shl_1), L(shl_table_fwd)) + .int JMPTBL (L(shl_2), L(shl_table_fwd)) + .int JMPTBL (L(shl_3), L(shl_table_fwd)) + .int JMPTBL (L(shl_4), L(shl_table_fwd)) + .int JMPTBL (L(shl_5), L(shl_table_fwd)) + .int JMPTBL (L(shl_6), L(shl_table_fwd)) + .int JMPTBL (L(shl_7), L(shl_table_fwd)) + .int JMPTBL (L(shl_8), L(shl_table_fwd)) + .int JMPTBL (L(shl_9), L(shl_table_fwd)) + .int JMPTBL (L(shl_10), L(shl_table_fwd)) + .int JMPTBL (L(shl_11), L(shl_table_fwd)) + .int JMPTBL (L(shl_12), L(shl_table_fwd)) + .int JMPTBL (L(shl_13), L(shl_table_fwd)) + .int JMPTBL (L(shl_14), L(shl_table_fwd)) + .int JMPTBL (L(shl_15), L(shl_table_fwd)) + + .p2align 3 +L(shl_table_bwd): + .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3.S new file mode 100644 index 0000000000..f3ea52a46c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy-ssse3.S @@ -0,0 +1,3150 @@ +/* memcpy with SSSE3 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#if IS_IN (libc) \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +#include "asm-syntax.h" + +#ifndef MEMCPY +# define MEMCPY __memcpy_ssse3 +# define MEMCPY_CHK __memcpy_chk_ssse3 +# define MEMPCPY __mempcpy_ssse3 +# define MEMPCPY_CHK __mempcpy_chk_ssse3 +#endif + +#define JMPTBL(I, B) I - B + +/* Branch to an entry in a jump table. TABLE is a jump table with + relative offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), INDEX; \ + lea (%r11, INDEX), INDEX; \ + jmp *INDEX; \ + ud2 + + .section .text.ssse3,"ax",@progbits +#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE +ENTRY (MEMPCPY_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMPCPY_CHK) + +ENTRY (MEMPCPY) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (MEMPCPY) +#endif + +#if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif + +ENTRY (MEMCPY) + mov %rdi, %rax +#ifdef USE_AS_MEMPCPY + add %rdx, %rax +#endif + +#ifdef USE_AS_MEMMOVE + cmp %rsi, %rdi + jb L(copy_forward) + je L(write_0bytes) + cmp $79, %rdx + jbe L(copy_forward) + jmp L(copy_backward) +L(copy_forward): +#endif +L(start): + cmp $79, %rdx + lea L(table_less_80bytes)(%rip), %r11 + ja L(80bytesormore) + movslq (%r11, %rdx, 4), %r9 + add %rdx, %rsi + add %rdx, %rdi + add %r11, %r9 + jmp *%r9 + ud2 + + .p2align 4 +L(80bytesormore): +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jle L(copy_backward) +#endif + + movdqu (%rsi), %xmm0 + mov %rdi, %rcx + and $-16, %rdi + add $16, %rdi + mov %rcx, %r8 + sub %rdi, %rcx + add %rcx, %rdx + sub %rcx, %rsi + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif + cmp %rcx, %rdx + mov %rsi, %r9 + ja L(large_page_fwd) + and $0xf, %r9 + jz L(shl_0) +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_data_cache_size_half(%rip), %RCX_LP +#endif + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4) + + .p2align 4 +L(copy_backward): + movdqu -16(%rsi, %rdx), %xmm0 + add %rdx, %rsi + lea -16(%rdi, %rdx), %r8 + add %rdx, %rdi + + mov %rdi, %rcx + and $0xf, %rcx + xor %rcx, %rdi + sub %rcx, %rdx + sub %rcx, %rsi + +#ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_shared_cache_size_half(%rip), %RCX_LP +#endif + + cmp %rcx, %rdx + mov %rsi, %r9 + ja L(large_page_bwd) + and $0xf, %r9 + jz L(shl_0_bwd) +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %RCX_LP +#else + mov __x86_data_cache_size_half(%rip), %RCX_LP +#endif + BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4) + + .p2align 4 +L(shl_0): + sub $16, %rdx + movdqa (%rsi), %xmm1 + add $16, %rsi + movdqa %xmm1, (%rdi) + add $16, %rdi + cmp $128, %rdx + movdqu %xmm0, (%r8) + ja L(shl_0_gobble) + cmp $64, %rdx + jb L(shl_0_less_64bytes) + movaps (%rsi), %xmm4 + movaps 16(%rsi), %xmm1 + movaps 32(%rsi), %xmm2 + movaps 48(%rsi), %xmm3 + movaps %xmm4, (%rdi) + movaps %xmm1, 16(%rdi) + movaps %xmm2, 32(%rdi) + movaps %xmm3, 48(%rdi) + sub $64, %rdx + add $64, %rsi + add $64, %rdi +L(shl_0_less_64bytes): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble): +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %RDX_LP +#else + cmp __x86_data_cache_size_half(%rip), %RDX_LP +#endif + lea -128(%rdx), %rdx + jae L(shl_0_gobble_mem_loop) +L(shl_0_gobble_cache_loop): + movdqa (%rsi), %xmm4 + movaps 0x10(%rsi), %xmm1 + movaps 0x20(%rsi), %xmm2 + movaps 0x30(%rsi), %xmm3 + + movdqa %xmm4, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + + sub $128, %rdx + movaps 0x40(%rsi), %xmm4 + movaps 0x50(%rsi), %xmm5 + movaps 0x60(%rsi), %xmm6 + movaps 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + movaps %xmm4, 0x40(%rdi) + movaps %xmm5, 0x50(%rdi) + movaps %xmm6, 0x60(%rdi) + movaps %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + + jae L(shl_0_gobble_cache_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_cache_less_64bytes) + + movdqa (%rsi), %xmm4 + sub $0x40, %rdx + movdqa 0x10(%rsi), %xmm1 + + movdqa %xmm4, (%rdi) + movdqa %xmm1, 0x10(%rdi) + + movdqa 0x20(%rsi), %xmm4 + movdqa 0x30(%rsi), %xmm1 + add $0x40, %rsi + + movdqa %xmm4, 0x20(%rdi) + movdqa %xmm1, 0x30(%rdi) + add $0x40, %rdi +L(shl_0_cache_less_64bytes): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_mem_loop): + prefetcht0 0x1c0(%rsi) + prefetcht0 0x280(%rsi) + + movdqa (%rsi), %xmm0 + movdqa 0x10(%rsi), %xmm1 + movdqa 0x20(%rsi), %xmm2 + movdqa 0x30(%rsi), %xmm3 + movdqa 0x40(%rsi), %xmm4 + movdqa 0x50(%rsi), %xmm5 + movdqa 0x60(%rsi), %xmm6 + movdqa 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + sub $0x80, %rdx + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + movdqa %xmm2, 0x20(%rdi) + movdqa %xmm3, 0x30(%rdi) + movdqa %xmm4, 0x40(%rdi) + movdqa %xmm5, 0x50(%rdi) + movdqa %xmm6, 0x60(%rdi) + movdqa %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + + jae L(shl_0_gobble_mem_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_mem_less_64bytes) + + movdqa (%rsi), %xmm0 + sub $0x40, %rdx + movdqa 0x10(%rsi), %xmm1 + + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + + movdqa 0x20(%rsi), %xmm0 + movdqa 0x30(%rsi), %xmm1 + add $0x40, %rsi + + movdqa %xmm0, 0x20(%rdi) + movdqa %xmm1, 0x30(%rdi) + add $0x40, %rdi +L(shl_0_mem_less_64bytes): + cmp $0x20, %rdx + jb L(shl_0_mem_less_32bytes) + movdqa (%rsi), %xmm0 + sub $0x20, %rdx + movdqa 0x10(%rsi), %xmm1 + add $0x20, %rsi + movdqa %xmm0, (%rdi) + movdqa %xmm1, 0x10(%rdi) + add $0x20, %rdi +L(shl_0_mem_less_32bytes): + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_bwd): + sub $16, %rdx + movdqa -0x10(%rsi), %xmm1 + sub $16, %rsi + movdqa %xmm1, -0x10(%rdi) + sub $16, %rdi + cmp $0x80, %rdx + movdqu %xmm0, (%r8) + ja L(shl_0_gobble_bwd) + cmp $64, %rdx + jb L(shl_0_less_64bytes_bwd) + movaps -0x10(%rsi), %xmm0 + movaps -0x20(%rsi), %xmm1 + movaps -0x30(%rsi), %xmm2 + movaps -0x40(%rsi), %xmm3 + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + sub $64, %rdx + sub $0x40, %rsi + sub $0x40, %rdi +L(shl_0_less_64bytes_bwd): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_bwd): +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %RDX_LP +#else + cmp __x86_data_cache_size_half(%rip), %RDX_LP +#endif + lea -128(%rdx), %rdx + jae L(shl_0_gobble_mem_bwd_loop) +L(shl_0_gobble_bwd_loop): + movdqa -0x10(%rsi), %xmm0 + movaps -0x20(%rsi), %xmm1 + movaps -0x30(%rsi), %xmm2 + movaps -0x40(%rsi), %xmm3 + + movdqa %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + + sub $0x80, %rdx + movaps -0x50(%rsi), %xmm4 + movaps -0x60(%rsi), %xmm5 + movaps -0x70(%rsi), %xmm6 + movaps -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + movaps %xmm4, -0x50(%rdi) + movaps %xmm5, -0x60(%rdi) + movaps %xmm6, -0x70(%rdi) + movaps %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + + jae L(shl_0_gobble_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_gobble_bwd_less_64bytes) + + movdqa -0x10(%rsi), %xmm0 + sub $0x40, %rdx + movdqa -0x20(%rsi), %xmm1 + + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + + movdqa -0x30(%rsi), %xmm0 + movdqa -0x40(%rsi), %xmm1 + sub $0x40, %rsi + + movdqa %xmm0, -0x30(%rdi) + movdqa %xmm1, -0x40(%rdi) + sub $0x40, %rdi +L(shl_0_gobble_bwd_less_64bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_0_gobble_mem_bwd_loop): + prefetcht0 -0x1c0(%rsi) + prefetcht0 -0x280(%rsi) + movdqa -0x10(%rsi), %xmm0 + movdqa -0x20(%rsi), %xmm1 + movdqa -0x30(%rsi), %xmm2 + movdqa -0x40(%rsi), %xmm3 + movdqa -0x50(%rsi), %xmm4 + movdqa -0x60(%rsi), %xmm5 + movdqa -0x70(%rsi), %xmm6 + movdqa -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + sub $0x80, %rdx + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + movdqa %xmm2, -0x30(%rdi) + movdqa %xmm3, -0x40(%rdi) + movdqa %xmm4, -0x50(%rdi) + movdqa %xmm5, -0x60(%rdi) + movdqa %xmm6, -0x70(%rdi) + movdqa %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + + jae L(shl_0_gobble_mem_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(shl_0_mem_bwd_less_64bytes) + + movdqa -0x10(%rsi), %xmm0 + sub $0x40, %rdx + movdqa -0x20(%rsi), %xmm1 + + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + + movdqa -0x30(%rsi), %xmm0 + movdqa -0x40(%rsi), %xmm1 + sub $0x40, %rsi + + movdqa %xmm0, -0x30(%rdi) + movdqa %xmm1, -0x40(%rdi) + sub $0x40, %rdi +L(shl_0_mem_bwd_less_64bytes): + cmp $0x20, %rdx + jb L(shl_0_mem_bwd_less_32bytes) + movdqa -0x10(%rsi), %xmm0 + sub $0x20, %rdx + movdqa -0x20(%rsi), %xmm1 + sub $0x20, %rsi + movdqa %xmm0, -0x10(%rdi) + movdqa %xmm1, -0x20(%rdi) + sub $0x20, %rdi +L(shl_0_mem_bwd_less_32bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_1): + lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x01(%rsi), %xmm1 + jb L(L1_fwd) + lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 +L(L1_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_1_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_1_loop_L1): + sub $64, %rdx + movaps 0x0f(%rsi), %xmm2 + movaps 0x1f(%rsi), %xmm3 + movaps 0x2f(%rsi), %xmm4 + movaps 0x3f(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $1, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $1, %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $1, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_1_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_1_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_1_bwd): + lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x01(%rsi), %xmm1 + jb L(L1_bwd) + lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 +L(L1_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_1_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_1_bwd_loop_L1): + movaps -0x11(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x21(%rsi), %xmm3 + movaps -0x31(%rsi), %xmm4 + movaps -0x41(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $1, %xmm2, %xmm1 + palignr $1, %xmm3, %xmm2 + palignr $1, %xmm4, %xmm3 + palignr $1, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_1_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_1_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_2): + lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x02(%rsi), %xmm1 + jb L(L2_fwd) + lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 +L(L2_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_2_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_2_loop_L1): + sub $64, %rdx + movaps 0x0e(%rsi), %xmm2 + movaps 0x1e(%rsi), %xmm3 + movaps 0x2e(%rsi), %xmm4 + movaps 0x3e(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $2, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $2, %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $2, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_2_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_2_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_2_bwd): + lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x02(%rsi), %xmm1 + jb L(L2_bwd) + lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 +L(L2_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_2_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_2_bwd_loop_L1): + movaps -0x12(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x22(%rsi), %xmm3 + movaps -0x32(%rsi), %xmm4 + movaps -0x42(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $2, %xmm2, %xmm1 + palignr $2, %xmm3, %xmm2 + palignr $2, %xmm4, %xmm3 + palignr $2, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_2_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_2_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_3): + lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x03(%rsi), %xmm1 + jb L(L3_fwd) + lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 +L(L3_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_3_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_3_loop_L1): + sub $64, %rdx + movaps 0x0d(%rsi), %xmm2 + movaps 0x1d(%rsi), %xmm3 + movaps 0x2d(%rsi), %xmm4 + movaps 0x3d(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $3, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $3, %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $3, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_3_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_3_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_3_bwd): + lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x03(%rsi), %xmm1 + jb L(L3_bwd) + lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 +L(L3_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_3_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_3_bwd_loop_L1): + movaps -0x13(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x23(%rsi), %xmm3 + movaps -0x33(%rsi), %xmm4 + movaps -0x43(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $3, %xmm2, %xmm1 + palignr $3, %xmm3, %xmm2 + palignr $3, %xmm4, %xmm3 + palignr $3, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_3_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_3_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_4): + lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x04(%rsi), %xmm1 + jb L(L4_fwd) + lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 +L(L4_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_4_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_4_loop_L1): + sub $64, %rdx + movaps 0x0c(%rsi), %xmm2 + movaps 0x1c(%rsi), %xmm3 + movaps 0x2c(%rsi), %xmm4 + movaps 0x3c(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $4, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $4, %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $4, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_4_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_4_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_4_bwd): + lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x04(%rsi), %xmm1 + jb L(L4_bwd) + lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 +L(L4_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_4_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_4_bwd_loop_L1): + movaps -0x14(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x24(%rsi), %xmm3 + movaps -0x34(%rsi), %xmm4 + movaps -0x44(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $4, %xmm2, %xmm1 + palignr $4, %xmm3, %xmm2 + palignr $4, %xmm4, %xmm3 + palignr $4, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_4_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_4_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_5): + lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x05(%rsi), %xmm1 + jb L(L5_fwd) + lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 +L(L5_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_5_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_5_loop_L1): + sub $64, %rdx + movaps 0x0b(%rsi), %xmm2 + movaps 0x1b(%rsi), %xmm3 + movaps 0x2b(%rsi), %xmm4 + movaps 0x3b(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $5, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $5, %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $5, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_5_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_5_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_5_bwd): + lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x05(%rsi), %xmm1 + jb L(L5_bwd) + lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 +L(L5_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_5_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_5_bwd_loop_L1): + movaps -0x15(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x25(%rsi), %xmm3 + movaps -0x35(%rsi), %xmm4 + movaps -0x45(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $5, %xmm2, %xmm1 + palignr $5, %xmm3, %xmm2 + palignr $5, %xmm4, %xmm3 + palignr $5, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_5_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_5_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_6): + lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x06(%rsi), %xmm1 + jb L(L6_fwd) + lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 +L(L6_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_6_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_6_loop_L1): + sub $64, %rdx + movaps 0x0a(%rsi), %xmm2 + movaps 0x1a(%rsi), %xmm3 + movaps 0x2a(%rsi), %xmm4 + movaps 0x3a(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $6, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $6, %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $6, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_6_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_6_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_6_bwd): + lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x06(%rsi), %xmm1 + jb L(L6_bwd) + lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 +L(L6_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_6_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_6_bwd_loop_L1): + movaps -0x16(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x26(%rsi), %xmm3 + movaps -0x36(%rsi), %xmm4 + movaps -0x46(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $6, %xmm2, %xmm1 + palignr $6, %xmm3, %xmm2 + palignr $6, %xmm4, %xmm3 + palignr $6, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_6_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_6_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_7): + lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_fwd) + lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 +L(L7_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_7_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_7_loop_L1): + sub $64, %rdx + movaps 0x09(%rsi), %xmm2 + movaps 0x19(%rsi), %xmm3 + movaps 0x29(%rsi), %xmm4 + movaps 0x39(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $7, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $7, %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $7, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_7_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_7_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_7_bwd): + lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x07(%rsi), %xmm1 + jb L(L7_bwd) + lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 +L(L7_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_7_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_7_bwd_loop_L1): + movaps -0x17(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x27(%rsi), %xmm3 + movaps -0x37(%rsi), %xmm4 + movaps -0x47(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $7, %xmm2, %xmm1 + palignr $7, %xmm3, %xmm2 + palignr $7, %xmm4, %xmm3 + palignr $7, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_7_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_7_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_8): + lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_fwd) + lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 +L(L8_fwd): + lea -64(%rdx), %rdx + jmp *%r9 +L(shl_8_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_8_loop_L1): + sub $64, %rdx + movaps 0x08(%rsi), %xmm2 + movaps 0x18(%rsi), %xmm3 + movaps 0x28(%rsi), %xmm4 + movaps 0x38(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $8, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $8, %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $8, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_8_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 + .p2align 4 +L(shl_8_end): + lea 64(%rdx), %rdx + movaps %xmm4, -0x20(%rdi) + add %rdx, %rsi + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_8_bwd): + lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x08(%rsi), %xmm1 + jb L(L8_bwd) + lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 +L(L8_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_8_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_8_bwd_loop_L1): + movaps -0x18(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x28(%rsi), %xmm3 + movaps -0x38(%rsi), %xmm4 + movaps -0x48(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $8, %xmm2, %xmm1 + palignr $8, %xmm3, %xmm2 + palignr $8, %xmm4, %xmm3 + palignr $8, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_8_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_8_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_9): + lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x09(%rsi), %xmm1 + jb L(L9_fwd) + lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 +L(L9_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_9_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_9_loop_L1): + sub $64, %rdx + movaps 0x07(%rsi), %xmm2 + movaps 0x17(%rsi), %xmm3 + movaps 0x27(%rsi), %xmm4 + movaps 0x37(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $9, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $9, %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $9, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_9_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_9_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_9_bwd): + lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x09(%rsi), %xmm1 + jb L(L9_bwd) + lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 +L(L9_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_9_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_9_bwd_loop_L1): + movaps -0x19(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x29(%rsi), %xmm3 + movaps -0x39(%rsi), %xmm4 + movaps -0x49(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $9, %xmm2, %xmm1 + palignr $9, %xmm3, %xmm2 + palignr $9, %xmm4, %xmm3 + palignr $9, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_9_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_9_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_10): + lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0a(%rsi), %xmm1 + jb L(L10_fwd) + lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 +L(L10_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_10_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_10_loop_L1): + sub $64, %rdx + movaps 0x06(%rsi), %xmm2 + movaps 0x16(%rsi), %xmm3 + movaps 0x26(%rsi), %xmm4 + movaps 0x36(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $10, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $10, %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $10, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_10_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_10_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_10_bwd): + lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0a(%rsi), %xmm1 + jb L(L10_bwd) + lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 +L(L10_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_10_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_10_bwd_loop_L1): + movaps -0x1a(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2a(%rsi), %xmm3 + movaps -0x3a(%rsi), %xmm4 + movaps -0x4a(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $10, %xmm2, %xmm1 + palignr $10, %xmm3, %xmm2 + palignr $10, %xmm4, %xmm3 + palignr $10, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_10_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_10_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_11): + lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0b(%rsi), %xmm1 + jb L(L11_fwd) + lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 +L(L11_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_11_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_11_loop_L1): + sub $64, %rdx + movaps 0x05(%rsi), %xmm2 + movaps 0x15(%rsi), %xmm3 + movaps 0x25(%rsi), %xmm4 + movaps 0x35(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $11, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $11, %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $11, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_11_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_11_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_11_bwd): + lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0b(%rsi), %xmm1 + jb L(L11_bwd) + lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9 +L(L11_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_11_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_11_bwd_loop_L1): + movaps -0x1b(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2b(%rsi), %xmm3 + movaps -0x3b(%rsi), %xmm4 + movaps -0x4b(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $11, %xmm2, %xmm1 + palignr $11, %xmm3, %xmm2 + palignr $11, %xmm4, %xmm3 + palignr $11, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_11_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_11_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_12): + lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0c(%rsi), %xmm1 + jb L(L12_fwd) + lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9 +L(L12_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_12_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_12_loop_L1): + sub $64, %rdx + movaps 0x04(%rsi), %xmm2 + movaps 0x14(%rsi), %xmm3 + movaps 0x24(%rsi), %xmm4 + movaps 0x34(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $12, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $12, %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $12, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_12_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_12_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_12_bwd): + lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0c(%rsi), %xmm1 + jb L(L12_bwd) + lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9 +L(L12_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_12_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_12_bwd_loop_L1): + movaps -0x1c(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2c(%rsi), %xmm3 + movaps -0x3c(%rsi), %xmm4 + movaps -0x4c(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $12, %xmm2, %xmm1 + palignr $12, %xmm3, %xmm2 + palignr $12, %xmm4, %xmm3 + palignr $12, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_12_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_12_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_13): + lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0d(%rsi), %xmm1 + jb L(L13_fwd) + lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9 +L(L13_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_13_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_13_loop_L1): + sub $64, %rdx + movaps 0x03(%rsi), %xmm2 + movaps 0x13(%rsi), %xmm3 + movaps 0x23(%rsi), %xmm4 + movaps 0x33(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $13, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $13, %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $13, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_13_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_13_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_13_bwd): + lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0d(%rsi), %xmm1 + jb L(L13_bwd) + lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9 +L(L13_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_13_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_13_bwd_loop_L1): + movaps -0x1d(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2d(%rsi), %xmm3 + movaps -0x3d(%rsi), %xmm4 + movaps -0x4d(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $13, %xmm2, %xmm1 + palignr $13, %xmm3, %xmm2 + palignr $13, %xmm4, %xmm3 + palignr $13, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_13_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_13_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_14): + lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0e(%rsi), %xmm1 + jb L(L14_fwd) + lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9 +L(L14_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_14_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_14_loop_L1): + sub $64, %rdx + movaps 0x02(%rsi), %xmm2 + movaps 0x12(%rsi), %xmm3 + movaps 0x22(%rsi), %xmm4 + movaps 0x32(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $14, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $14, %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $14, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_14_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_14_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_14_bwd): + lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0e(%rsi), %xmm1 + jb L(L14_bwd) + lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9 +L(L14_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_14_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_14_bwd_loop_L1): + movaps -0x1e(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2e(%rsi), %xmm3 + movaps -0x3e(%rsi), %xmm4 + movaps -0x4e(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $14, %xmm2, %xmm1 + palignr $14, %xmm3, %xmm2 + palignr $14, %xmm4, %xmm3 + palignr $14, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_14_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_14_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_15): + lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0f(%rsi), %xmm1 + jb L(L15_fwd) + lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9 +L(L15_fwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_15_loop_L2): + prefetchnta 0x1c0(%rsi) +L(shl_15_loop_L1): + sub $64, %rdx + movaps 0x01(%rsi), %xmm2 + movaps 0x11(%rsi), %xmm3 + movaps 0x21(%rsi), %xmm4 + movaps 0x31(%rsi), %xmm5 + movdqa %xmm5, %xmm6 + palignr $15, %xmm4, %xmm5 + lea 64(%rsi), %rsi + palignr $15, %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + lea 64(%rdi), %rdi + palignr $15, %xmm1, %xmm2 + movdqa %xmm6, %xmm1 + movdqa %xmm2, -0x40(%rdi) + movaps %xmm3, -0x30(%rdi) + jb L(shl_15_end) + movaps %xmm4, -0x20(%rdi) + movaps %xmm5, -0x10(%rdi) + jmp *%r9 + ud2 +L(shl_15_end): + movaps %xmm4, -0x20(%rdi) + lea 64(%rdx), %rdx + movaps %xmm5, -0x10(%rdi) + add %rdx, %rdi + movdqu %xmm0, (%r8) + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(shl_15_bwd): + lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 + cmp %rcx, %rdx + movaps -0x0f(%rsi), %xmm1 + jb L(L15_bwd) + lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9 +L(L15_bwd): + lea -64(%rdx), %rdx + jmp *%r9 + ud2 +L(shl_15_bwd_loop_L2): + prefetchnta -0x1c0(%rsi) +L(shl_15_bwd_loop_L1): + movaps -0x1f(%rsi), %xmm2 + sub $0x40, %rdx + movaps -0x2f(%rsi), %xmm3 + movaps -0x3f(%rsi), %xmm4 + movaps -0x4f(%rsi), %xmm5 + lea -0x40(%rsi), %rsi + palignr $15, %xmm2, %xmm1 + palignr $15, %xmm3, %xmm2 + palignr $15, %xmm4, %xmm3 + palignr $15, %xmm5, %xmm4 + + movaps %xmm1, -0x10(%rdi) + movaps %xmm5, %xmm1 + + movaps %xmm2, -0x20(%rdi) + lea -0x40(%rdi), %rdi + + movaps %xmm3, 0x10(%rdi) + jb L(shl_15_bwd_end) + movaps %xmm4, (%rdi) + jmp *%r9 + ud2 +L(shl_15_bwd_end): + movaps %xmm4, (%rdi) + lea 64(%rdx), %rdx + movdqu %xmm0, (%r8) + BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) + + .p2align 4 +L(write_72bytes): + movdqu -72(%rsi), %xmm0 + movdqu -56(%rsi), %xmm1 + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rcx + movdqu %xmm0, -72(%rdi) + movdqu %xmm1, -56(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(write_64bytes): + movdqu -64(%rsi), %xmm0 + mov -48(%rsi), %rcx + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + movdqu %xmm0, -64(%rdi) + mov %rcx, -48(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_56bytes): + movdqu -56(%rsi), %xmm0 + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rcx + movdqu %xmm0, -56(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rcx, -8(%rdi) + ret + + .p2align 4 +L(write_48bytes): + mov -48(%rsi), %rcx + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %rcx, -48(%rdi) + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_40bytes): + mov -40(%rsi), %r8 + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r8, -40(%rdi) + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_32bytes): + mov -32(%rsi), %r9 + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r9, -32(%rdi) + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_24bytes): + mov -24(%rsi), %r10 + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r10, -24(%rdi) + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_16bytes): + mov -16(%rsi), %r11 + mov -8(%rsi), %rdx + mov %r11, -16(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_8bytes): + mov -8(%rsi), %rdx + mov %rdx, -8(%rdi) +L(write_0bytes): + ret + + .p2align 4 +L(write_73bytes): + movdqu -73(%rsi), %xmm0 + movdqu -57(%rsi), %xmm1 + mov -41(%rsi), %rcx + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %r8 + mov -4(%rsi), %edx + movdqu %xmm0, -73(%rdi) + movdqu %xmm1, -57(%rdi) + mov %rcx, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %r8, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_65bytes): + movdqu -65(%rsi), %xmm0 + movdqu -49(%rsi), %xmm1 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -65(%rdi) + movdqu %xmm1, -49(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_57bytes): + movdqu -57(%rsi), %xmm0 + mov -41(%rsi), %r8 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -57(%rdi) + mov %r8, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_49bytes): + movdqu -49(%rsi), %xmm0 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -49(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_41bytes): + mov -41(%rsi), %r8 + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r8, -41(%rdi) + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_33bytes): + mov -33(%rsi), %r9 + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r9, -33(%rdi) + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_25bytes): + mov -25(%rsi), %r10 + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -1(%rsi), %dl + mov %r10, -25(%rdi) + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_17bytes): + mov -17(%rsi), %r11 + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -17(%rdi) + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_9bytes): + mov -9(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -9(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_1bytes): + mov -1(%rsi), %dl + mov %dl, -1(%rdi) + ret + + .p2align 4 +L(write_74bytes): + movdqu -74(%rsi), %xmm0 + movdqu -58(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -74(%rdi) + movdqu %xmm1, -58(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_66bytes): + movdqu -66(%rsi), %xmm0 + movdqu -50(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -66(%rdi) + movdqu %xmm1, -50(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_58bytes): + movdqu -58(%rsi), %xmm1 + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm1, -58(%rdi) + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_50bytes): + movdqu -50(%rsi), %xmm0 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -50(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_42bytes): + mov -42(%rsi), %r8 + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -42(%rdi) + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_34bytes): + mov -34(%rsi), %r9 + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -34(%rdi) + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_26bytes): + mov -26(%rsi), %r10 + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -26(%rdi) + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_18bytes): + mov -18(%rsi), %r11 + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -18(%rdi) + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_10bytes): + mov -10(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -10(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_2bytes): + mov -2(%rsi), %dx + mov %dx, -2(%rdi) + ret + + .p2align 4 +L(write_75bytes): + movdqu -75(%rsi), %xmm0 + movdqu -59(%rsi), %xmm1 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -75(%rdi) + movdqu %xmm1, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_67bytes): + movdqu -67(%rsi), %xmm0 + movdqu -59(%rsi), %xmm1 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -67(%rdi) + movdqu %xmm1, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_59bytes): + movdqu -59(%rsi), %xmm0 + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -59(%rdi) + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_51bytes): + movdqu -51(%rsi), %xmm0 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -51(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_43bytes): + mov -43(%rsi), %r8 + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -43(%rdi) + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_35bytes): + mov -35(%rsi), %r9 + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -35(%rdi) + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_27bytes): + mov -27(%rsi), %r10 + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -27(%rdi) + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_19bytes): + mov -19(%rsi), %r11 + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -19(%rdi) + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_11bytes): + mov -11(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -11(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_3bytes): + mov -3(%rsi), %dx + mov -2(%rsi), %cx + mov %dx, -3(%rdi) + mov %cx, -2(%rdi) + ret + + .p2align 4 +L(write_76bytes): + movdqu -76(%rsi), %xmm0 + movdqu -60(%rsi), %xmm1 + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -76(%rdi) + movdqu %xmm1, -60(%rdi) + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_68bytes): + movdqu -68(%rsi), %xmm0 + movdqu -52(%rsi), %xmm1 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -68(%rdi) + movdqu %xmm1, -52(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_60bytes): + movdqu -60(%rsi), %xmm0 + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -60(%rdi) + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_52bytes): + movdqu -52(%rsi), %xmm0 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + movdqu %xmm0, -52(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_44bytes): + mov -44(%rsi), %r8 + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r8, -44(%rdi) + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_36bytes): + mov -36(%rsi), %r9 + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r9, -36(%rdi) + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_28bytes): + mov -28(%rsi), %r10 + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r10, -28(%rdi) + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_20bytes): + mov -20(%rsi), %r11 + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %r11, -20(%rdi) + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_12bytes): + mov -12(%rsi), %rcx + mov -4(%rsi), %edx + mov %rcx, -12(%rdi) + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_4bytes): + mov -4(%rsi), %edx + mov %edx, -4(%rdi) + ret + + .p2align 4 +L(write_77bytes): + movdqu -77(%rsi), %xmm0 + movdqu -61(%rsi), %xmm1 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -77(%rdi) + movdqu %xmm1, -61(%rdi) + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_69bytes): + movdqu -69(%rsi), %xmm0 + movdqu -53(%rsi), %xmm1 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -69(%rdi) + movdqu %xmm1, -53(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_61bytes): + movdqu -61(%rsi), %xmm0 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -61(%rdi) + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_53bytes): + movdqu -53(%rsi), %xmm0 + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -53(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_45bytes): + mov -45(%rsi), %r8 + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -45(%rdi) + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_37bytes): + mov -37(%rsi), %r9 + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -37(%rdi) + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_29bytes): + mov -29(%rsi), %r10 + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -29(%rdi) + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_21bytes): + mov -21(%rsi), %r11 + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -21(%rdi) + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_13bytes): + mov -13(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -13(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_5bytes): + mov -5(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -5(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(write_78bytes): + movdqu -78(%rsi), %xmm0 + movdqu -62(%rsi), %xmm1 + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -78(%rdi) + movdqu %xmm1, -62(%rdi) + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_70bytes): + movdqu -70(%rsi), %xmm0 + movdqu -54(%rsi), %xmm1 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -70(%rdi) + movdqu %xmm1, -54(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_62bytes): + movdqu -62(%rsi), %xmm0 + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -62(%rdi) + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_54bytes): + movdqu -54(%rsi), %xmm0 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -54(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_46bytes): + mov -46(%rsi), %r8 + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -46(%rdi) + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_38bytes): + mov -38(%rsi), %r9 + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -38(%rdi) + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_30bytes): + mov -30(%rsi), %r10 + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -30(%rdi) + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_22bytes): + mov -22(%rsi), %r11 + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -22(%rdi) + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_14bytes): + mov -14(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -14(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_6bytes): + mov -6(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -6(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(write_79bytes): + movdqu -79(%rsi), %xmm0 + movdqu -63(%rsi), %xmm1 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -79(%rdi) + movdqu %xmm1, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_71bytes): + movdqu -71(%rsi), %xmm0 + movdqu -55(%rsi), %xmm1 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -71(%rdi) + movdqu %xmm1, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_63bytes): + movdqu -63(%rsi), %xmm0 + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -63(%rdi) + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_55bytes): + movdqu -55(%rsi), %xmm0 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + movdqu %xmm0, -55(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_47bytes): + mov -47(%rsi), %r8 + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r8, -47(%rdi) + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_39bytes): + mov -39(%rsi), %r9 + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r9, -39(%rdi) + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_31bytes): + mov -31(%rsi), %r10 + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r10, -31(%rdi) + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_23bytes): + mov -23(%rsi), %r11 + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %r11, -23(%rdi) + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_15bytes): + mov -15(%rsi), %rcx + mov -8(%rsi), %rdx + mov %rcx, -15(%rdi) + mov %rdx, -8(%rdi) + ret + + .p2align 4 +L(write_7bytes): + mov -7(%rsi), %edx + mov -4(%rsi), %ecx + mov %edx, -7(%rdi) + mov %ecx, -4(%rdi) + ret + + .p2align 4 +L(large_page_fwd): + movdqu (%rsi), %xmm1 + lea 16(%rsi), %rsi + movdqu %xmm0, (%r8) + movntdq %xmm1, (%rdi) + lea 16(%rdi), %rdi + lea -0x90(%rdx), %rdx +#ifdef USE_AS_MEMMOVE + mov %rsi, %r9 + sub %rdi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_fwd) + shl $2, %rcx + cmp %rcx, %rdx + jb L(ll_cache_copy_fwd_start) +L(memmove_is_memcpy_fwd): +#endif +L(large_page_loop): + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + + sub $0x80, %rdx + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + movntdq %xmm4, 0x40(%rdi) + movntdq %xmm5, 0x50(%rdi) + movntdq %xmm6, 0x60(%rdi) + movntdq %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(large_page_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_64bytes) + + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi + + movntdq %xmm0, (%rdi) + movntdq %xmm1, 0x10(%rdi) + movntdq %xmm2, 0x20(%rdi) + movntdq %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_less_64bytes): + add %rdx, %rsi + add %rdx, %rdi + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#ifdef USE_AS_MEMMOVE + .p2align 4 +L(ll_cache_copy_fwd_start): + prefetcht0 0x1c0(%rsi) + prefetcht0 0x200(%rsi) + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + movdqu 0x40(%rsi), %xmm4 + movdqu 0x50(%rsi), %xmm5 + movdqu 0x60(%rsi), %xmm6 + movdqu 0x70(%rsi), %xmm7 + lea 0x80(%rsi), %rsi + + sub $0x80, %rdx + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + movaps %xmm4, 0x40(%rdi) + movaps %xmm5, 0x50(%rdi) + movaps %xmm6, 0x60(%rdi) + movaps %xmm7, 0x70(%rdi) + lea 0x80(%rdi), %rdi + jae L(ll_cache_copy_fwd_start) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_ll_less_fwd_64bytes) + + movdqu (%rsi), %xmm0 + movdqu 0x10(%rsi), %xmm1 + movdqu 0x20(%rsi), %xmm2 + movdqu 0x30(%rsi), %xmm3 + lea 0x40(%rsi), %rsi + + movaps %xmm0, (%rdi) + movaps %xmm1, 0x10(%rdi) + movaps %xmm2, 0x20(%rdi) + movaps %xmm3, 0x30(%rdi) + lea 0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_ll_less_fwd_64bytes): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#endif + .p2align 4 +L(large_page_bwd): + movdqu -0x10(%rsi), %xmm1 + lea -16(%rsi), %rsi + movdqu %xmm0, (%r8) + movdqa %xmm1, -0x10(%rdi) + lea -16(%rdi), %rdi + lea -0x90(%rdx), %rdx +#ifdef USE_AS_MEMMOVE + mov %rdi, %r9 + sub %rsi, %r9 + cmp %rdx, %r9 + jae L(memmove_is_memcpy_bwd) + cmp %rcx, %r9 + jb L(ll_cache_copy_bwd_start) +L(memmove_is_memcpy_bwd): +#endif +L(large_page_bwd_loop): + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + + sub $0x80, %rdx + movntdq %xmm0, -0x10(%rdi) + movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + movntdq %xmm4, -0x50(%rdi) + movntdq %xmm5, -0x60(%rdi) + movntdq %xmm6, -0x70(%rdi) + movntdq %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(large_page_bwd_loop) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_less_bwd_64bytes) + + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi + + movntdq %xmm0, -0x10(%rdi) + movntdq %xmm1, -0x20(%rdi) + movntdq %xmm2, -0x30(%rdi) + movntdq %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_less_bwd_64bytes): + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) + +#ifdef USE_AS_MEMMOVE + .p2align 4 +L(ll_cache_copy_bwd_start): + prefetcht0 -0x1c0(%rsi) + prefetcht0 -0x200(%rsi) + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + movdqu -0x50(%rsi), %xmm4 + movdqu -0x60(%rsi), %xmm5 + movdqu -0x70(%rsi), %xmm6 + movdqu -0x80(%rsi), %xmm7 + lea -0x80(%rsi), %rsi + + sub $0x80, %rdx + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + movaps %xmm4, -0x50(%rdi) + movaps %xmm5, -0x60(%rdi) + movaps %xmm6, -0x70(%rdi) + movaps %xmm7, -0x80(%rdi) + lea -0x80(%rdi), %rdi + jae L(ll_cache_copy_bwd_start) + cmp $-0x40, %rdx + lea 0x80(%rdx), %rdx + jl L(large_page_ll_less_bwd_64bytes) + + movdqu -0x10(%rsi), %xmm0 + movdqu -0x20(%rsi), %xmm1 + movdqu -0x30(%rsi), %xmm2 + movdqu -0x40(%rsi), %xmm3 + lea -0x40(%rsi), %rsi + + movaps %xmm0, -0x10(%rdi) + movaps %xmm1, -0x20(%rdi) + movaps %xmm2, -0x30(%rdi) + movaps %xmm3, -0x40(%rdi) + lea -0x40(%rdi), %rdi + sub $0x40, %rdx +L(large_page_ll_less_bwd_64bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) +#endif + +END (MEMCPY) + + .section .rodata.ssse3,"a",@progbits + .p2align 3 +L(table_less_80bytes): + .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_66bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_71bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_74bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) + .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) + + .p2align 3 +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + .p2align 3 +L(shl_table_bwd): + .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) + .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy.S new file mode 100644 index 0000000000..af2770397c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy.S @@ -0,0 +1,75 @@ +/* Multiple versions of memcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need memcpy before the initialization + happened. */ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(__new_memcpy) + .type __new_memcpy, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __memcpy_erms(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_ERMS) + jnz 2f + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memcpy_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memcpy_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memcpy_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __memcpy_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memcpy_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memcpy_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memcpy_ssse3(%rip), %RAX_LP +2: ret +END(__new_memcpy) + +# undef memcpy +# include <shlib-compat.h> +versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14); +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memcpy_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy_chk.S new file mode 100644 index 0000000000..8737fb9755 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memcpy_chk.S @@ -0,0 +1,72 @@ +/* Multiple versions of __memcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch memcpy functions for static binaries. + */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__memcpy_chk) + .type __memcpy_chk, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memcpy_chk_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __memcpy_chk_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_chk_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memcpy_chk_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memcpy_chk_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memcpy_chk_ssse3(%rip), %RAX_LP +2: ret +END(__memcpy_chk) +# else +# include "../memcpy_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S new file mode 100644 index 0000000000..e195e93f15 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S @@ -0,0 +1,12 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define SECTION(p) p##.avx +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S new file mode 100644 index 0000000000..f3ef10577c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S @@ -0,0 +1,420 @@ +/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#if IS_IN (libc) + +# include "asm-syntax.h" + + .section .text.avx512,"ax",@progbits +# if defined SHARED && !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE +ENTRY (__mempcpy_chk_avx512_no_vzeroupper) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__mempcpy_chk_avx512_no_vzeroupper) + +ENTRY (__mempcpy_avx512_no_vzeroupper) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (__mempcpy_avx512_no_vzeroupper) +# endif + +# ifdef SHARED +ENTRY (__memmove_chk_avx512_no_vzeroupper) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memmove_chk_avx512_no_vzeroupper) +# endif + +ENTRY (__memmove_avx512_no_vzeroupper) + mov %rdi, %rax +# ifdef USE_AS_MEMPCPY + add %rdx, %rax +# endif +L(start): + lea (%rsi, %rdx), %rcx + lea (%rdi, %rdx), %r9 + cmp $512, %rdx + ja L(512bytesormore) + +L(check): + cmp $16, %rdx + jbe L(less_16bytes) + cmp $256, %rdx + jb L(less_256bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups -0x100(%rcx), %zmm4 + vmovups -0xC0(%rcx), %zmm5 + vmovups -0x80(%rcx), %zmm6 + vmovups -0x40(%rcx), %zmm7 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, -0x100(%r9) + vmovups %zmm5, -0xC0(%r9) + vmovups %zmm6, -0x80(%r9) + vmovups %zmm7, -0x40(%r9) + ret + +L(less_256bytes): + cmp $128, %dl + jb L(less_128bytes) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups -0x80(%rcx), %zmm2 + vmovups -0x40(%rcx), %zmm3 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, -0x80(%r9) + vmovups %zmm3, -0x40(%r9) + ret + +L(less_128bytes): + cmp $64, %dl + jb L(less_64bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu 0x20(%rsi), %ymm1 + vmovdqu -0x40(%rcx), %ymm2 + vmovdqu -0x20(%rcx), %ymm3 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, 0x20(%rdi) + vmovdqu %ymm2, -0x40(%r9) + vmovdqu %ymm3, -0x20(%r9) + ret + +L(less_64bytes): + cmp $32, %dl + jb L(less_32bytes) + vmovdqu (%rsi), %ymm0 + vmovdqu -0x20(%rcx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -0x20(%r9) + ret + +L(less_32bytes): + vmovdqu (%rsi), %xmm0 + vmovdqu -0x10(%rcx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -0x10(%r9) + ret + +L(less_16bytes): + cmp $8, %dl + jb L(less_8bytes) + movq (%rsi), %rsi + movq -0x8(%rcx), %rcx + movq %rsi, (%rdi) + movq %rcx, -0x8(%r9) + ret + +L(less_8bytes): + cmp $4, %dl + jb L(less_4bytes) + mov (%rsi), %esi + mov -0x4(%rcx), %ecx + mov %esi, (%rdi) + mov %ecx, -0x4(%r9) + ret + +L(less_4bytes): + cmp $2, %dl + jb L(less_2bytes) + mov (%rsi), %si + mov -0x2(%rcx), %cx + mov %si, (%rdi) + mov %cx, -0x2(%r9) + ret + +L(less_2bytes): + cmp $1, %dl + jb L(less_1bytes) + mov (%rsi), %cl + mov %cl, (%rdi) +L(less_1bytes): + ret + +L(512bytesormore): +# ifdef SHARED_CACHE_SIZE_HALF + mov $SHARED_CACHE_SIZE_HALF, %r8 +# else + mov __x86_shared_cache_size_half(%rip), %r8 +# endif + cmp %r8, %rdx + jae L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + prefetcht1 -0x200(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0x40(%rcx) + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + vmovups %zmm8, -0x200(%r9) + vmovups %zmm9, -0x1C0(%r9) + vmovups %zmm10, -0x180(%r9) + vmovups %zmm11, -0x140(%r9) + vmovups %zmm12, -0x100(%r9) + vmovups %zmm13, -0xC0(%r9) + vmovups %zmm14, -0x80(%r9) + vmovups %zmm15, -0x40(%r9) + ret + +L(1024bytesormore): + cmp %rsi, %rdi + ja L(1024bytesormore_bkw) + sub $512, %r9 + vmovups -0x200(%rcx), %zmm8 + vmovups -0x1C0(%rcx), %zmm9 + vmovups -0x180(%rcx), %zmm10 + vmovups -0x140(%rcx), %zmm11 + vmovups -0x100(%rcx), %zmm12 + vmovups -0xC0(%rcx), %zmm13 + vmovups -0x80(%rcx), %zmm14 + vmovups -0x40(%rcx), %zmm15 + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + +/* Loop with unaligned memory access. */ +L(gobble_512bytes_loop): + vmovups (%rsi), %zmm0 + vmovups 0x40(%rsi), %zmm1 + vmovups 0x80(%rsi), %zmm2 + vmovups 0xC0(%rsi), %zmm3 + vmovups 0x100(%rsi), %zmm4 + vmovups 0x140(%rsi), %zmm5 + vmovups 0x180(%rsi), %zmm6 + vmovups 0x1C0(%rsi), %zmm7 + add $512, %rsi + prefetcht1 (%rsi) + prefetcht1 0x40(%rsi) + prefetcht1 0x80(%rsi) + prefetcht1 0xC0(%rsi) + prefetcht1 0x100(%rsi) + prefetcht1 0x140(%rsi) + prefetcht1 0x180(%rsi) + prefetcht1 0x1C0(%rsi) + vmovups %zmm0, (%rdi) + vmovups %zmm1, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm3, 0xC0(%rdi) + vmovups %zmm4, 0x100(%rdi) + vmovups %zmm5, 0x140(%rdi) + vmovups %zmm6, 0x180(%rdi) + vmovups %zmm7, 0x1C0(%rdi) + add $512, %rdi + cmp %r9, %rdi + jb L(gobble_512bytes_loop) + vmovups %zmm8, (%r9) + vmovups %zmm9, 0x40(%r9) + vmovups %zmm10, 0x80(%r9) + vmovups %zmm11, 0xC0(%r9) + vmovups %zmm12, 0x100(%r9) + vmovups %zmm13, 0x140(%r9) + vmovups %zmm14, 0x180(%r9) + vmovups %zmm15, 0x1C0(%r9) + ret + +L(1024bytesormore_bkw): + add $512, %rdi + vmovups 0x1C0(%rsi), %zmm8 + vmovups 0x180(%rsi), %zmm9 + vmovups 0x140(%rsi), %zmm10 + vmovups 0x100(%rsi), %zmm11 + vmovups 0xC0(%rsi), %zmm12 + vmovups 0x80(%rsi), %zmm13 + vmovups 0x40(%rsi), %zmm14 + vmovups (%rsi), %zmm15 + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + +/* Backward loop with unaligned memory access. */ +L(gobble_512bytes_loop_bkw): + vmovups -0x40(%rcx), %zmm0 + vmovups -0x80(%rcx), %zmm1 + vmovups -0xC0(%rcx), %zmm2 + vmovups -0x100(%rcx), %zmm3 + vmovups -0x140(%rcx), %zmm4 + vmovups -0x180(%rcx), %zmm5 + vmovups -0x1C0(%rcx), %zmm6 + vmovups -0x200(%rcx), %zmm7 + sub $512, %rcx + prefetcht1 -0x40(%rcx) + prefetcht1 -0x80(%rcx) + prefetcht1 -0xC0(%rcx) + prefetcht1 -0x100(%rcx) + prefetcht1 -0x140(%rcx) + prefetcht1 -0x180(%rcx) + prefetcht1 -0x1C0(%rcx) + prefetcht1 -0x200(%rcx) + vmovups %zmm0, -0x40(%r9) + vmovups %zmm1, -0x80(%r9) + vmovups %zmm2, -0xC0(%r9) + vmovups %zmm3, -0x100(%r9) + vmovups %zmm4, -0x140(%r9) + vmovups %zmm5, -0x180(%r9) + vmovups %zmm6, -0x1C0(%r9) + vmovups %zmm7, -0x200(%r9) + sub $512, %r9 + cmp %rdi, %r9 + ja L(gobble_512bytes_loop_bkw) + vmovups %zmm8, -0x40(%rdi) + vmovups %zmm9, -0x80(%rdi) + vmovups %zmm10, -0xC0(%rdi) + vmovups %zmm11, -0x100(%rdi) + vmovups %zmm12, -0x140(%rdi) + vmovups %zmm13, -0x180(%rdi) + vmovups %zmm14, -0x1C0(%rdi) + vmovups %zmm15, -0x200(%rdi) + ret + +L(preloop_large): + cmp %rsi, %rdi + ja L(preloop_large_bkw) + vmovups (%rsi), %zmm4 + vmovups 0x40(%rsi), %zmm5 + +/* Align destination for access with non-temporal stores in the loop. */ + mov %rdi, %r8 + and $-0x80, %rdi + add $0x80, %rdi + sub %rdi, %r8 + sub %r8, %rsi + add %r8, %rdx +L(gobble_256bytes_nt_loop): + prefetcht1 0x200(%rsi) + prefetcht1 0x240(%rsi) + prefetcht1 0x280(%rsi) + prefetcht1 0x2C0(%rsi) + prefetcht1 0x300(%rsi) + prefetcht1 0x340(%rsi) + prefetcht1 0x380(%rsi) + prefetcht1 0x3C0(%rsi) + vmovdqu64 (%rsi), %zmm0 + vmovdqu64 0x40(%rsi), %zmm1 + vmovdqu64 0x80(%rsi), %zmm2 + vmovdqu64 0xC0(%rsi), %zmm3 + vmovntdq %zmm0, (%rdi) + vmovntdq %zmm1, 0x40(%rdi) + vmovntdq %zmm2, 0x80(%rdi) + vmovntdq %zmm3, 0xC0(%rdi) + sub $256, %rdx + add $256, %rsi + add $256, %rdi + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop) + sfence + vmovups %zmm4, (%rax) + vmovups %zmm5, 0x40(%rax) + jmp L(check) + +L(preloop_large_bkw): + vmovups -0x80(%rcx), %zmm4 + vmovups -0x40(%rcx), %zmm5 + +/* Align end of destination for access with non-temporal stores. */ + mov %r9, %r8 + and $-0x80, %r9 + sub %r9, %r8 + sub %r8, %rcx + sub %r8, %rdx + add %r9, %r8 +L(gobble_256bytes_nt_loop_bkw): + prefetcht1 -0x400(%rcx) + prefetcht1 -0x3C0(%rcx) + prefetcht1 -0x380(%rcx) + prefetcht1 -0x340(%rcx) + prefetcht1 -0x300(%rcx) + prefetcht1 -0x2C0(%rcx) + prefetcht1 -0x280(%rcx) + prefetcht1 -0x240(%rcx) + vmovdqu64 -0x100(%rcx), %zmm0 + vmovdqu64 -0xC0(%rcx), %zmm1 + vmovdqu64 -0x80(%rcx), %zmm2 + vmovdqu64 -0x40(%rcx), %zmm3 + vmovntdq %zmm0, -0x100(%r9) + vmovntdq %zmm1, -0xC0(%r9) + vmovntdq %zmm2, -0x80(%r9) + vmovntdq %zmm3, -0x40(%r9) + sub $256, %rdx + sub $256, %rcx + sub $256, %r9 + cmp $256, %rdx + ja L(gobble_256bytes_nt_loop_bkw) + sfence + vmovups %zmm4, -0x80(%r8) + vmovups %zmm5, -0x40(%r8) + jmp L(check) +END (__memmove_avx512_no_vzeroupper) + +# ifdef SHARED +strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper) +strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper) +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S new file mode 100644 index 0000000000..aac1515cf6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S @@ -0,0 +1,12 @@ +#if IS_IN (libc) +# define VEC_SIZE 64 +# define VEC(i) zmm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# define SECTION(p) p##.avx512 +# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3-back.S new file mode 100644 index 0000000000..f9a4e9aff9 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3-back.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3_back +#define MEMCPY_CHK __memmove_chk_ssse3_back +#include "memcpy-ssse3-back.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3.S new file mode 100644 index 0000000000..295430b1ef --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3 +#define MEMCPY_CHK __memmove_chk_ssse3 +#include "memcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S new file mode 100644 index 0000000000..dee3ec529c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -0,0 +1,553 @@ +/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* memmove/memcpy/mempcpy is implemented as: + 1. Use overlapping load and store to avoid branch. + 2. Load all sources into registers and store them together to avoid + possible address overlap between source and destination. + 3. If size is 8 * VEC_SIZE or less, load all sources into registers + and store them together. + 4. If address of destination > address of source, backward copy + 4 * VEC_SIZE at a time with unaligned load and aligned store. + Load the first 4 * VEC and last VEC before the loop and store + them after the loop to support overlapping addresses. + 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned + load and aligned store. Load the last 4 * VEC and first VEC + before the loop and store them after the loop to support + overlapping addresses. + 6. If size >= __x86_shared_non_temporal_threshold and there is no + overlap between destination and source, use non-temporal store + instead of aligned store. */ + +#include <sysdep.h> + +#ifndef MEMCPY_SYMBOL +# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef MEMPCPY_SYMBOL +# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef MEMMOVE_CHK_SYMBOL +# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) +#endif + +#ifndef VZEROUPPER +# if VEC_SIZE > 16 +# define VZEROUPPER vzeroupper +# else +# define VZEROUPPER +# endif +#endif + +/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set + up REP MOVSB operation, REP MOVSB isn't faster on short data. The + memcpy micro benchmark in glibc shows that 2KB is the approximate + value above which REP MOVSB becomes faster than SSE2 optimization + on processors with Enhanced REP MOVSB. Since larger register size + can move more data with a single load and store, the threshold is + higher with larger register size. */ +#ifndef REP_MOVSB_THRESHOLD +# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16)) +#endif + +#ifndef PREFETCH +# define PREFETCH(addr) prefetcht0 addr +#endif + +/* Assume 64-byte prefetch size. */ +#ifndef PREFETCH_SIZE +# define PREFETCH_SIZE 64 +#endif + +#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) + +#if PREFETCH_SIZE == 64 +# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base) +# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE)base) +# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE +# define PREFETCH_ONE_SET(dir, base, offset) \ + PREFETCH ((offset)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ + PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) +# else +# error Unsupported PREFETCHED_LOAD_SIZE! +# endif +#else +# error Unsupported PREFETCH_SIZE! +#endif + +#ifndef SECTION +# error SECTION is not defined! +#endif + + .section SECTION(.text),"ax",@progbits +#if defined SHARED && IS_IN (libc) +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) +#endif + +#if VEC_SIZE == 16 || defined SHARED +ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) +#endif + +#if defined SHARED && IS_IN (libc) +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) +#endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) + movq %rdi, %rax +L(start): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) +#if !defined USE_MULTIARCH || !IS_IN (libc) +L(last_2x_vec): +#endif + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + VZEROUPPER +#if !defined USE_MULTIARCH || !IS_IN (libc) +L(nop): +#endif + ret +#if defined USE_MULTIARCH && IS_IN (libc) +END (MEMMOVE_SYMBOL (__memmove, unaligned)) + +# if VEC_SIZE == 16 +# if defined SHARED +/* Only used to measure performance of REP MOVSB. */ +ENTRY (__mempcpy_erms) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start_movsb) +END (__mempcpy_erms) +# endif + +ENTRY (__memmove_erms) + movq %rdi, %rax +L(start_movsb): + movq %rdx, %rcx + cmpq %rsi, %rdi + jb 1f + /* Source == destination is less common. */ + je 2f + leaq (%rsi,%rcx), %rdx + cmpq %rdx, %rdi + jb L(movsb_backward) +1: + rep movsb +2: + ret +L(movsb_backward): + leaq -1(%rdi,%rcx), %rdi + leaq -1(%rsi,%rcx), %rsi + std + rep movsb + cld + ret +END (__memmove_erms) +# if defined SHARED +strong_alias (__memmove_erms, __memcpy_erms) +# endif +# endif + +# ifdef SHARED +ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) + +ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start_erms) +END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + +ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) +# endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + movq %rdi, %rax +L(start_erms): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(movsb_more_2x_vec) +L(last_2x_vec): + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) +L(return): + VZEROUPPER + ret + +L(movsb): + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx + jae L(more_8x_vec) + cmpq %rsi, %rdi + jb 1f + /* Source == destination is less common. */ + je L(nop) + leaq (%rsi,%rdx), %r9 + cmpq %r9, %rdi + /* Avoid slow backward REP MOVSB. */ +# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8) +# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE! +# endif + jb L(more_8x_vec_backward) +1: + movq %rdx, %rcx + rep movsb +L(nop): + ret +#endif + +L(less_vec): + /* Less than 1 VEC. */ +#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! +#endif +#if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +#endif +#if VEC_SIZE > 16 + cmpb $16, %dl + jae L(between_16_31) +#endif + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl + jae L(between_4_7) + cmpb $1, %dl + ja L(between_2_3) + jb 1f + movzbl (%rsi), %ecx + movb %cl, (%rdi) +1: + ret +#if VEC_SIZE > 32 +L(between_32_63): + /* From 32 to 63. No branch when size == 32. */ + vmovdqu (%rsi), %ymm0 + vmovdqu -32(%rsi,%rdx), %ymm1 + vmovdqu %ymm0, (%rdi) + vmovdqu %ymm1, -32(%rdi,%rdx) + VZEROUPPER + ret +#endif +#if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +L(between_16_31): + vmovdqu (%rsi), %xmm0 + vmovdqu -16(%rsi,%rdx), %xmm1 + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm1, -16(%rdi,%rdx) + ret +#endif +L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ + movq -8(%rsi,%rdx), %rcx + movq (%rsi), %rsi + movq %rcx, -8(%rdi,%rdx) + movq %rsi, (%rdi) + ret +L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl -4(%rsi,%rdx), %ecx + movl (%rsi), %esi + movl %ecx, -4(%rdi,%rdx) + movl %esi, (%rdi) + ret +L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + movzwl -2(%rsi,%rdx), %ecx + movzwl (%rsi), %esi + movw %cx, -2(%rdi,%rdx) + movw %si, (%rdi) + ret + +#if defined USE_MULTIARCH && IS_IN (libc) +L(movsb_more_2x_vec): + cmpq $REP_MOVSB_THRESHOLD, %rdx + ja L(movsb) +#endif +L(more_2x_vec): + /* More than 2 * VEC and there may be overlap between destination + and source. */ + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx + jb L(last_4x_vec) + /* Copy from 4 * VEC to 8 * VEC, inclusively. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5) + VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6) + VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) + VZEROUPPER + ret +L(last_4x_vec): + /* Copy from 2 * VEC to 4 * VEC. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) + VZEROUPPER + ret + +L(more_8x_vec): + cmpq %rsi, %rdi + ja L(more_8x_vec_backward) + /* Source == destination is less common. */ + je L(nop) + /* Load the first VEC and last 4 * VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) + /* Save start and stop of the destination buffer. */ + movq %rdi, %r11 + leaq -VEC_SIZE(%rdi, %rdx), %rcx + /* Align destination for aligned stores in the loop. Compute + how much destination is misaligned. */ + movq %rdi, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Get the negative of offset for alignment. */ + subq $VEC_SIZE, %r8 + /* Adjust source. */ + subq %r8, %rsi + /* Adjust destination which should be aligned now. */ + subq %r8, %rdi + /* Adjust length. */ + addq %r8, %rdx +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx + ja L(large_forward) +#endif +L(loop_4x_vec_forward): + /* Copy 4 * VEC a time forward. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $(VEC_SIZE * 4), %rsi + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $(VEC_SIZE * 4), %rdi + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_forward) + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret + +L(more_8x_vec_backward): + /* Load the first 4 * VEC and last VEC to support overlapping + addresses. */ + VMOVU (%rsi), %VEC(4) + VMOVU VEC_SIZE(%rsi), %VEC(5) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8) + /* Save stop of the destination buffer. */ + leaq -VEC_SIZE(%rdi, %rdx), %r11 + /* Align destination end for aligned stores in the loop. Compute + how much destination end is misaligned. */ + leaq -VEC_SIZE(%rsi, %rdx), %rcx + movq %r11, %r9 + movq %r11, %r8 + andq $(VEC_SIZE - 1), %r8 + /* Adjust source. */ + subq %r8, %rcx + /* Adjust the end of destination which should be aligned now. */ + subq %r8, %r9 + /* Adjust length. */ + subq %r8, %rdx +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx + ja L(large_backward) +#endif +L(loop_4x_vec_backward): + /* Copy 4 * VEC a time backward. */ + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $(VEC_SIZE * 4), %rcx + subq $(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%r9) + VMOVA %VEC(1), -VEC_SIZE(%r9) + VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $(VEC_SIZE * 4), %r9 + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_backward) + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret + +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +L(large_forward): + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rdi, %rdx), %r10 + cmpq %r10, %rsi + jb L(loop_4x_vec_forward) +L(loop_large_forward): + /* Copy 4 * VEC a time forward with non-temporal stores. */ + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + addq $PREFETCHED_LOAD_SIZE, %rsi + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%rdi) + VMOVNT %VEC(1), VEC_SIZE(%rdi) + VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) + addq $PREFETCHED_LOAD_SIZE, %rdi + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_forward) + sfence + /* Store the last 4 * VEC. */ + VMOVU %VEC(5), (%rcx) + VMOVU %VEC(6), -VEC_SIZE(%rcx) + VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) + VZEROUPPER + ret + +L(large_backward): + /* Don't use non-temporal store if there is overlap between + destination and source since destination may be in cache + when source is loaded. */ + leaq (%rcx, %rdx), %r10 + cmpq %r10, %r9 + jb L(loop_4x_vec_backward) +L(loop_large_backward): + /* Copy 4 * VEC a time backward with non-temporal stores. */ + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) + subq $PREFETCHED_LOAD_SIZE, %rcx + subq $PREFETCHED_LOAD_SIZE, %rdx + VMOVNT %VEC(0), (%r9) + VMOVNT %VEC(1), -VEC_SIZE(%r9) + VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) + subq $PREFETCHED_LOAD_SIZE, %r9 + cmpq $PREFETCHED_LOAD_SIZE, %rdx + ja L(loop_large_backward) + sfence + /* Store the first 4 * VEC. */ + VMOVU %VEC(4), (%rdi) + VMOVU %VEC(5), VEC_SIZE(%rdi) + VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) + VZEROUPPER + ret +#endif +END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + +#ifdef SHARED +# if IS_IN (libc) +# ifdef USE_MULTIARCH +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), + MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) +strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), + MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) +# endif +strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), + MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) +# endif +#endif +#if VEC_SIZE == 16 || defined SHARED +strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), + MEMCPY_SYMBOL (__memcpy, unaligned)) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove.S new file mode 100644 index 0000000000..8c534e83e0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove.S @@ -0,0 +1,101 @@ +/* Multiple versions of memmove + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. */ +#if IS_IN (libc) + .text +ENTRY(__libc_memmove) + .type __libc_memmove, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __memmove_erms(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_ERMS) + jnz 2f + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memmove_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memmove_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memmove_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __memmove_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memmove_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memmove_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memmove_ssse3(%rip), %RAX_LP +2: ret +END(__libc_memmove) +#endif + +#if IS_IN (libc) +# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s + +# ifdef SHARED +libc_hidden_ver (__memmove_sse2_unaligned, memmove) +libc_hidden_ver (__memcpy_sse2_unaligned, memcpy) +libc_hidden_ver (__mempcpy_sse2_unaligned, mempcpy) +libc_hidden_ver (__mempcpy_sse2_unaligned, __mempcpy) + +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal memmove calls through a PLT. + The speedup we get from using SSE2 instructions is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def +# endif +strong_alias (__libc_memmove, memmove) +#endif + +#if !defined SHARED || !IS_IN (libc) +weak_alias (__mempcpy, mempcpy) +#endif + +#include "../memmove.S" + +#if defined SHARED && IS_IN (libc) +# include <shlib-compat.h> +# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14) +/* Use __memmove_sse2_unaligned to support overlapping addresses. */ +compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5); +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memmove_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/memmove_chk.S new file mode 100644 index 0000000000..7870dd0247 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memmove_chk.S @@ -0,0 +1,71 @@ +/* Multiple versions of __memmove_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch memmove functions for static binaries. + */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__memmove_chk) + .type __memmove_chk, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __memmove_chk_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memmove_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memmove_chk_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __memmove_chk_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_chk_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __memmove_chk_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __memmove_chk_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __memmove_chk_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __memmove_chk_ssse3(%rip), %RAX_LP +2: ret +END(__memmove_chk) +# else +# include "../memmove_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy.S new file mode 100644 index 0000000000..b8b2b28094 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy.S @@ -0,0 +1,73 @@ +/* Multiple versions of mempcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need mempcpy before the initialization + happened. */ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(__mempcpy) + .type __mempcpy, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __mempcpy_erms(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_ERMS) + jnz 2f + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __mempcpy_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __mempcpy_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __mempcpy_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __mempcpy_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __mempcpy_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __mempcpy_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __mempcpy_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __mempcpy_ssse3(%rip), %RAX_LP +2: ret +END(__mempcpy) + +weak_alias (__mempcpy, mempcpy) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy_chk.S new file mode 100644 index 0000000000..072b22c49f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/mempcpy_chk.S @@ -0,0 +1,72 @@ +/* Multiple versions of __mempcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch mempcpy functions for static binaries. + */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__mempcpy_chk) + .type __mempcpy_chk, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 1f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 1f + lea __mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __mempcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __mempcpy_chk_avx512_unaligned(%rip), %RAX_LP + ret +1: lea __mempcpy_chk_avx_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz L(Fast_Unaligned_Load) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __mempcpy_chk_avx_unaligned_erms(%rip), %RAX_LP + ret +L(Fast_Unaligned_Load): + lea __mempcpy_chk_sse2_unaligned(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Unaligned_Copy) + jz L(SSSE3) + HAS_CPU_FEATURE (ERMS) + jz 2f + lea __mempcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP + ret +L(SSSE3): + HAS_CPU_FEATURE (SSSE3) + jz 2f + lea __mempcpy_chk_ssse3_back(%rip), %RAX_LP + HAS_ARCH_FEATURE (Fast_Copy_Backward) + jnz 2f + lea __mempcpy_chk_ssse3(%rip), %RAX_LP +2: ret +END(__mempcpy_chk) +# else +# include "../mempcpy_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S new file mode 100644 index 0000000000..7ab3d89849 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -0,0 +1,22 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastb %xmm0, %ymm0 + +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastd %xmm0, %ymm0 + +# define SECTION(p) p##.avx +# define MEMSET_SYMBOL(p,s) p##_avx2_##s +# define WMEMSET_SYMBOL(p,s) p##_avx2_##s + +# include "memset-vec-unaligned-erms.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S new file mode 100644 index 0000000000..1f66602398 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S @@ -0,0 +1,194 @@ +/* memset optimized with AVX512 for KNL hardware. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#if IS_IN (libc) + +#include "asm-syntax.h" +#ifndef MEMSET +# define MEMSET __memset_avx512_no_vzeroupper +# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper +#endif + + .section .text.avx512,"ax",@progbits +#if defined PIC +ENTRY (MEMSET_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMSET_CHK) +#endif + +ENTRY (MEMSET) + vpxor %xmm0, %xmm0, %xmm0 + vmovd %esi, %xmm1 + lea (%rdi, %rdx), %rsi + mov %rdi, %rax + vpshufb %xmm0, %xmm1, %xmm0 + cmp $16, %rdx + jb L(less_16bytes) + cmp $512, %rdx + vbroadcastss %xmm0, %zmm2 + ja L(512bytesormore) + cmp $256, %rdx + jb L(less_256bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm2, 0xC0(%rdi) + vmovups %zmm2, -0x100(%rsi) + vmovups %zmm2, -0xC0(%rsi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_256bytes): + cmp $128, %dl + jb L(less_128bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_128bytes): + cmp $64, %dl + jb L(less_64bytes) + vmovups %zmm2, (%rdi) + vmovups %zmm2, -0x40(%rsi) + ret + +L(less_64bytes): + cmp $32, %dl + jb L(less_32bytes) + vmovdqu %ymm2, (%rdi) + vmovdqu %ymm2, -0x20(%rsi) + ret + +L(less_32bytes): + vmovdqu %xmm0, (%rdi) + vmovdqu %xmm0, -0x10(%rsi) + ret + +L(less_16bytes): + cmp $8, %dl + jb L(less_8bytes) + vmovq %xmm0, (%rdi) + vmovq %xmm0, -0x08(%rsi) + ret + +L(less_8bytes): + vmovd %xmm0, %ecx + cmp $4, %dl + jb L(less_4bytes) + mov %ecx, (%rdi) + mov %ecx, -0x04(%rsi) + ret + +L(less_4bytes): + cmp $2, %dl + jb L(less_2bytes) + mov %cx, (%rdi) + mov %cx, -0x02(%rsi) + ret + +L(less_2bytes): + cmp $1, %dl + jb L(less_1bytes) + mov %cl, (%rdi) +L(less_1bytes): + ret + +L(512bytesormore): + mov __x86_shared_cache_size_half(%rip), %rcx + cmp %rcx, %rdx + ja L(preloop_large) + cmp $1024, %rdx + ja L(1024bytesormore) + + vmovups %zmm2, (%rdi) + vmovups %zmm2, 0x40(%rdi) + vmovups %zmm2, 0x80(%rdi) + vmovups %zmm2, 0xC0(%rdi) + vmovups %zmm2, 0x100(%rdi) + vmovups %zmm2, 0x140(%rdi) + vmovups %zmm2, 0x180(%rdi) + vmovups %zmm2, 0x1C0(%rdi) + vmovups %zmm2, -0x200(%rsi) + vmovups %zmm2, -0x1C0(%rsi) + vmovups %zmm2, -0x180(%rsi) + vmovups %zmm2, -0x140(%rsi) + vmovups %zmm2, -0x100(%rsi) + vmovups %zmm2, -0xC0(%rsi) + vmovups %zmm2, -0x80(%rsi) + vmovups %zmm2, -0x40(%rsi) + ret + +/* Align on 64 and loop with aligned stores. */ +L(1024bytesormore): + sub $0x100, %rsi + vmovups %zmm2, (%rax) + and $-0x40, %rdi + add $0x40, %rdi + +L(gobble_256bytes_loop): + vmovaps %zmm2, (%rdi) + vmovaps %zmm2, 0x40(%rdi) + vmovaps %zmm2, 0x80(%rdi) + vmovaps %zmm2, 0xC0(%rdi) + add $0x100, %rdi + cmp %rsi, %rdi + jb L(gobble_256bytes_loop) + vmovups %zmm2, (%rsi) + vmovups %zmm2, 0x40(%rsi) + vmovups %zmm2, 0x80(%rsi) + vmovups %zmm2, 0xC0(%rsi) + ret + +/* Align on 128 and loop with non-temporal stores. */ +L(preloop_large): + and $-0x80, %rdi + add $0x80, %rdi + vmovups %zmm2, (%rax) + vmovups %zmm2, 0x40(%rax) + sub $0x200, %rsi + +L(gobble_512bytes_nt_loop): + vmovntdq %zmm2, (%rdi) + vmovntdq %zmm2, 0x40(%rdi) + vmovntdq %zmm2, 0x80(%rdi) + vmovntdq %zmm2, 0xC0(%rdi) + vmovntdq %zmm2, 0x100(%rdi) + vmovntdq %zmm2, 0x140(%rdi) + vmovntdq %zmm2, 0x180(%rdi) + vmovntdq %zmm2, 0x1C0(%rdi) + add $0x200, %rdi + cmp %rsi, %rdi + jb L(gobble_512bytes_nt_loop) + sfence + vmovups %zmm2, (%rsi) + vmovups %zmm2, 0x40(%rsi) + vmovups %zmm2, 0x80(%rsi) + vmovups %zmm2, 0xC0(%rsi) + vmovups %zmm2, 0x100(%rsi) + vmovups %zmm2, 0x140(%rsi) + vmovups %zmm2, 0x180(%rsi) + vmovups %zmm2, 0x1C0(%rsi) + ret +END (MEMSET) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S new file mode 100644 index 0000000000..0783979ca5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S @@ -0,0 +1,24 @@ +#if IS_IN (libc) +# define VEC_SIZE 64 +# define VEC(i) zmm##i +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastb %xmm0, %xmm0; \ + vpbroadcastq %xmm0, %zmm0 + +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ + movq r, %rax; \ + vpbroadcastd %xmm0, %xmm0; \ + vpbroadcastq %xmm0, %zmm0 + +# define SECTION(p) p##.avx512 +# define MEMSET_SYMBOL(p,s) p##_avx512_##s +# define WMEMSET_SYMBOL(p,s) p##_avx512_##s + +# include "memset-vec-unaligned-erms.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S new file mode 100644 index 0000000000..2eb9e3744e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -0,0 +1,263 @@ +/* memset/bzero with unaligned store and rep stosb + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* memset is implemented as: + 1. Use overlapping store to avoid branch. + 2. If size is less than VEC, use integer register stores. + 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores. + 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores. + 5. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with + 4 VEC stores and store 4 * VEC at a time until done. */ + +#include <sysdep.h> + +#ifndef MEMSET_CHK_SYMBOL +# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) +#endif + +#ifndef WMEMSET_CHK_SYMBOL +# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) +#endif + +#ifndef VZEROUPPER +# if VEC_SIZE > 16 +# define VZEROUPPER vzeroupper +# else +# define VZEROUPPER +# endif +#endif + +#ifndef VZEROUPPER_SHORT_RETURN +# if VEC_SIZE > 16 +# define VZEROUPPER_SHORT_RETURN vzeroupper +# else +# define VZEROUPPER_SHORT_RETURN rep +# endif +#endif + +#ifndef MOVQ +# if VEC_SIZE > 16 +# define MOVQ vmovq +# else +# define MOVQ movq +# endif +#endif + +/* Threshold to use Enhanced REP STOSB. Since there is overhead to set + up REP STOSB operation, REP STOSB isn't faster on short data. The + memset micro benchmark in glibc shows that 2KB is the approximate + value above which REP STOSB becomes faster on processors with + Enhanced REP STOSB. Since the stored value is fixed, larger register + size has minimal impact on threshold. */ +#ifndef REP_STOSB_THRESHOLD +# define REP_STOSB_THRESHOLD 2048 +#endif + +#ifndef SECTION +# error SECTION is not defined! +#endif + + .section SECTION(.text),"ax",@progbits +#if VEC_SIZE == 16 && IS_IN (libc) +ENTRY (__bzero) + movq %rdi, %rax /* Set return value. */ + movq %rsi, %rdx /* Set n. */ + pxor %xmm0, %xmm0 + jmp L(entry_from_bzero) +END (__bzero) +weak_alias (__bzero, bzero) +#endif + +#if IS_IN (libc) +# if defined SHARED +ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +# endif + +ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + shlq $2, %rdx + WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + jmp L(entry_from_bzero) +END (WMEMSET_SYMBOL (__wmemset, unaligned)) +#endif + +#if defined SHARED && IS_IN (libc) +ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) +#endif + +ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) + VZEROUPPER + ret +#if defined USE_MULTIARCH && IS_IN (libc) +END (MEMSET_SYMBOL (__memset, unaligned)) + +# if VEC_SIZE == 16 +/* Only used to measure performance of REP STOSB. */ +ENTRY (__memset_erms) +# else +/* Provide a symbol to debugger. */ +ENTRY (MEMSET_SYMBOL (__memset, erms)) +# endif +L(stosb): + /* Issue vzeroupper before rep stosb. */ + VZEROUPPER + movq %rdx, %rcx + movzbl %sil, %eax + movq %rdi, %rdx + rep stosb + movq %rdx, %rax + ret +# if VEC_SIZE == 16 +END (__memset_erms) +# else +END (MEMSET_SYMBOL (__memset, erms)) +# endif + +# if defined SHARED && IS_IN (libc) +ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) +# endif + +ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(stosb_more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) + VZEROUPPER + ret + +L(stosb_more_2x_vec): + cmpq $REP_STOSB_THRESHOLD, %rdx + ja L(stosb) +#endif +L(more_2x_vec): + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_start) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), VEC_SIZE(%rdi) + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) +L(return): + VZEROUPPER + ret + +L(loop_start): + leaq (VEC_SIZE * 4)(%rdi), %rcx + VMOVU %VEC(0), (%rdi) + andq $-(VEC_SIZE * 4), %rcx + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), VEC_SIZE(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx) + addq %rdi, %rdx + andq $-(VEC_SIZE * 4), %rdx + cmpq %rdx, %rcx + je L(return) +L(loop): + VMOVA %VEC(0), (%rcx) + VMOVA %VEC(0), VEC_SIZE(%rcx) + VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx) + VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx) + addq $(VEC_SIZE * 4), %rcx + cmpq %rcx, %rdx + jne L(loop) + VZEROUPPER_SHORT_RETURN + ret +L(less_vec): + /* Less than 1 VEC. */ +# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +# error Unsupported VEC_SIZE! +# endif +# if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +# endif +# if VEC_SIZE > 16 + cmpb $16, %dl + jae L(between_16_31) +# endif + MOVQ %xmm0, %rcx + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl + jae L(between_4_7) + cmpb $1, %dl + ja L(between_2_3) + jb 1f + movb %cl, (%rdi) +1: + VZEROUPPER + ret +# if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ +L(between_32_63): + vmovdqu %ymm0, -32(%rdi,%rdx) + vmovdqu %ymm0, (%rdi) + VZEROUPPER + ret +# endif +# if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +L(between_16_31): + vmovdqu %xmm0, -16(%rdi,%rdx) + vmovdqu %xmm0, (%rdi) + VZEROUPPER + ret +# endif + /* From 8 to 15. No branch when size == 8. */ +L(between_8_15): + movq %rcx, -8(%rdi,%rdx) + movq %rcx, (%rdi) + VZEROUPPER + ret +L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl %ecx, -4(%rdi,%rdx) + movl %ecx, (%rdi) + VZEROUPPER + ret +L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + movw %cx, -2(%rdi,%rdx) + movw %cx, (%rdi) + VZEROUPPER + ret +END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset.S new file mode 100644 index 0000000000..11f27378b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset.S @@ -0,0 +1,82 @@ +/* Multiple versions of memset + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <shlib-compat.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) +ENTRY(memset) + .type memset, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __memset_erms(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_ERMS) + jnz 2f + lea __memset_sse2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 1f + lea __memset_sse2_unaligned(%rip), %RAX_LP +1: + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + lea __memset_avx2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz L(AVX512F) + lea __memset_avx2_unaligned(%rip), %RAX_LP +L(AVX512F): + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 2f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 2f + lea __memset_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memset_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memset_avx512_unaligned(%rip), %RAX_LP +2: ret +END(memset) +#endif + +#if IS_IN (libc) +# define MEMSET_SYMBOL(p,s) p##_sse2_##s +# define WMEMSET_SYMBOL(p,s) p##_sse2_##s + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal memset calls through a PLT. + The speedup we get from using SSE2 instructions is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \ + .globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \ + .globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned +# endif + +# undef weak_alias +# define weak_alias(original, alias) \ + .weak bzero; bzero = __bzero + +# undef strong_alias +# define strong_alias(original, alias) +#endif + +#include "../memset.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/memset_chk.S b/REORG.TODO/sysdeps/x86_64/multiarch/memset_chk.S new file mode 100644 index 0000000000..7e08311cdf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/memset_chk.S @@ -0,0 +1,61 @@ +/* Multiple versions of memset_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2014-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) +# ifdef SHARED +ENTRY(__memset_chk) + .type __memset_chk, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + lea __memset_chk_sse2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 1f + lea __memset_chk_sse2_unaligned(%rip), %RAX_LP +1: + HAS_ARCH_FEATURE (AVX2_Usable) + jz 2f + lea __memset_chk_avx2_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz L(AVX512F) + lea __memset_chk_avx2_unaligned(%rip), %RAX_LP +L(AVX512F): + HAS_ARCH_FEATURE (Prefer_No_AVX512) + jnz 2f + HAS_ARCH_FEATURE (AVX512F_Usable) + jz 2f + lea __memset_chk_avx512_no_vzeroupper(%rip), %RAX_LP + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 2f + lea __memset_chk_avx512_unaligned_erms(%rip), %RAX_LP + HAS_CPU_FEATURE (ERMS) + jnz 2f + lea __memset_chk_avx512_unaligned(%rip), %RAX_LP +2: ret +END(__memset_chk) + +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) + .section .gnu.warning.__memset_zero_constant_len_parameter + .string "memset used with constant zero length parameter; this could be due to transposed parameters" +# else +# include "../memset_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/sched_cpucount.c b/REORG.TODO/sysdeps/x86_64/multiarch/sched_cpucount.c new file mode 100644 index 0000000000..453f183747 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/sched_cpucount.c @@ -0,0 +1,36 @@ +/* Count bits in CPU set. x86-64 multi-arch version. + This file is part of the GNU C Library. + Copyright (C) 2008-2017 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@redhat.com>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sched.h> +#include "init-arch.h" + +#define __sched_cpucount static generic_cpucount +#include <posix/sched_cpucount.c> +#undef __sched_cpucount + +#define POPCNT(l) \ + ({ __cpu_mask r; \ + asm ("popcnt %1, %0" : "=r" (r) : "0" (l));\ + r; }) +#define __sched_cpucount static popcount_cpucount +#include <posix/sched_cpucount.c> +#undef __sched_cpucount + +libc_ifunc (__sched_cpucount, + HAS_CPU_FEATURE (POPCOUNT) ? popcount_cpucount : generic_cpucount); diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S new file mode 100644 index 0000000000..34231f8b46 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_sse2_unaligned +#include "strcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-ssse3.S new file mode 100644 index 0000000000..d971c2da38 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy.S new file mode 100644 index 0000000000..ee81ab6ae3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpcpy.S @@ -0,0 +1,9 @@ +/* Multiple versions of stpcpy + All versions must be listed in ifunc-impl-list.c. */ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-c.c new file mode 100644 index 0000000000..2fde77dcab --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-c.c @@ -0,0 +1,8 @@ +#define STPNCPY __stpncpy_sse2 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2); +#endif + +#include "stpncpy.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S new file mode 100644 index 0000000000..658520f78f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_sse2_unaligned +#include "strcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-ssse3.S new file mode 100644 index 0000000000..14ed16f6b5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy.S new file mode 100644 index 0000000000..2698ca6a8c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/stpncpy.S @@ -0,0 +1,8 @@ +/* Multiple versions of stpncpy + All versions must be listed in ifunc-impl-list.c. */ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S new file mode 100644 index 0000000000..fb2f9ae14a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S @@ -0,0 +1,6 @@ +#define USE_SSSE3 1 +#define USE_AS_STRCASECMP_L +#define NO_NOLOCALE_ALIAS +#define STRCMP __strcasecmp_l_ssse3 +#define __strcasecmp __strcasecmp_ssse3 +#include "../strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l.S new file mode 100644 index 0000000000..49f5b9fd95 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcasecmp_l.S @@ -0,0 +1,8 @@ +/* Multiple versions of strcasecmp and strcasecmp_l + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP __strcasecmp_l +#define USE_AS_STRCASECMP_L +#include "strcmp.S" + +weak_alias (__strcasecmp_l, strcasecmp_l) +libc_hidden_def (strcasecmp_l) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S new file mode 100644 index 0000000000..d0a8a1518a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S @@ -0,0 +1,279 @@ +/* strcat with SSE2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCAT +# define STRCAT __strcat_sse2_unaligned +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) + mov %rdi, %r9 +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + +/* Inline corresponding strlen file, temporary until new strcpy + implementation gets merged. */ + + xor %rax, %rax + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%rdi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit_less16) + mov %rdi, %rax + and $-16, %rax + jmp L(align16_start) +L(next): + mov %rdi, %rax + and $-16, %rax + pcmpeqb (%rax), %xmm0 + mov $-1, %r10d + sub %rax, %rcx + shl %cl, %r10d + pmovmskb %xmm0, %edx + and %r10d, %edx + jnz L(exit) + +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pcmpeqb 16(%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 80(%rax), %xmm0 + add $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm1 + add $16, %rax + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm2 + add $16, %rax + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm3 + add $16, %rax + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit) + + add $16, %rax + .p2align 4 + L(align64_loop): + movaps (%rax), %xmm4 + pminub 16(%rax), %xmm4 + movaps 32(%rax), %xmm5 + pminub 48(%rax), %xmm5 + add $64, %rax + pminub %xmm4, %xmm5 + pcmpeqb %xmm0, %xmm5 + pmovmskb %xmm5, %edx + test %edx, %edx + jz L(align64_loop) + + pcmpeqb -64(%rax), %xmm0 + sub $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit): + sub %rdi, %rax +L(exit_less16): + bsf %rdx, %rdx + add %rdx, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit16): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $16, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit32): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $32, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit48): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $48, %rax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit64): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax + + .p2align 4 +L(StartStrcpyPart): + lea (%r9, %rax), %rdi + mov %rsi, %rcx + mov %r9, %rax /* save result */ + +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(ExitZero) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-sse2-unaligned.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcat-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-ssse3.S new file mode 100644 index 0000000000..edd683d778 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -0,0 +1,867 @@ +/* strcat with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef STRCAT +# define STRCAT __strcat_ssse3 +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + + +/* Inline corresponding strlen file, temporary until new strcpy + implementation gets merged. */ + + xor %eax, %eax + cmpb $0, (%rdi) + jz L(exit_tail0) + cmpb $0, 1(%rdi) + jz L(exit_tail1) + cmpb $0, 2(%rdi) + jz L(exit_tail2) + cmpb $0, 3(%rdi) + jz L(exit_tail3) + + cmpb $0, 4(%rdi) + jz L(exit_tail4) + cmpb $0, 5(%rdi) + jz L(exit_tail5) + cmpb $0, 6(%rdi) + jz L(exit_tail6) + cmpb $0, 7(%rdi) + jz L(exit_tail7) + + cmpb $0, 8(%rdi) + jz L(exit_tail8) + cmpb $0, 9(%rdi) + jz L(exit_tail9) + cmpb $0, 10(%rdi) + jz L(exit_tail10) + cmpb $0, 11(%rdi) + jz L(exit_tail11) + + cmpb $0, 12(%rdi) + jz L(exit_tail12) + cmpb $0, 13(%rdi) + jz L(exit_tail13) + cmpb $0, 14(%rdi) + jz L(exit_tail14) + cmpb $0, 15(%rdi) + jz L(exit_tail15) + pxor %xmm0, %xmm0 + lea 16(%rdi), %rcx + lea 16(%rdi), %rax + and $-16, %rax + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax + + .p2align 4 +L(aligned_64): + pcmpeqb (%rax), %xmm0 + pcmpeqb 16(%rax), %xmm1 + pcmpeqb 32(%rax), %xmm2 + pcmpeqb 48(%rax), %xmm3 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %r11d + pmovmskb %xmm2, %r10d + pmovmskb %xmm3, %r9d + or %edx, %r9d + or %r11d, %r9d + or %r10d, %r9d + lea 64(%rax), %rax + jz L(aligned_64) + + test %edx, %edx + jnz L(aligned_64_exit_16) + test %r11d, %r11d + jnz L(aligned_64_exit_32) + test %r10d, %r10d + jnz L(aligned_64_exit_48) + +L(aligned_64_exit_64): + pmovmskb %xmm3, %edx + jmp L(exit) + +L(aligned_64_exit_48): + lea -16(%rax), %rax + mov %r10d, %edx + jmp L(exit) + +L(aligned_64_exit_32): + lea -32(%rax), %rax + mov %r11d, %edx + jmp L(exit) + +L(aligned_64_exit_16): + lea -48(%rax), %rax + +L(exit): + sub %rcx, %rax + test %dl, %dl + jz L(exit_high) + test $0x01, %dl + jnz L(exit_tail0) + + test $0x02, %dl + jnz L(exit_tail1) + + test $0x04, %dl + jnz L(exit_tail2) + + test $0x08, %dl + jnz L(exit_tail3) + + test $0x10, %dl + jnz L(exit_tail4) + + test $0x20, %dl + jnz L(exit_tail5) + + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax +L(exit_tail0): + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_high): + add $8, %eax + test $0x01, %dh + jnz L(exit_tail0) + + test $0x02, %dh + jnz L(exit_tail1) + + test $0x04, %dh + jnz L(exit_tail2) + + test $0x08, %dh + jnz L(exit_tail3) + + test $0x10, %dh + jnz L(exit_tail4) + + test $0x20, %dh + jnz L(exit_tail5) + + test $0x40, %dh + jnz L(exit_tail6) + add $7, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail1): + add $1, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail2): + add $2, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail3): + add $3, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail4): + add $4, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail5): + add $5, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail6): + add $6, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail7): + add $7, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail8): + add $8, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail9): + add $9, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail10): + add $10, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail11): + add $11, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail12): + add $12, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail13): + add $13, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail14): + add $14, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_tail15): + add $15, %eax + + .p2align 4 +L(StartStrcpyPart): + mov %rsi, %rcx + lea (%rdi, %rax), %rdx +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(StrncatExit0) + cmp $8, %r8 + jbe L(StrncatExit8Bytes) +# endif + cmpb $0, (%rcx) + jz L(Exit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmpb $0, 7(%rcx) + jz L(Exit8) + cmpb $0, 8(%rcx) + jz L(Exit9) +# ifdef USE_AS_STRNCAT + cmp $16, %r8 + jb L(StrncatExit15Bytes) +# endif + cmpb $0, 9(%rcx) + jz L(Exit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmpb $0, 14(%rcx) + jz L(Exit15) + cmpb $0, 15(%rcx) + jz L(Exit16) +# ifdef USE_AS_STRNCAT + cmp $16, %r8 + je L(StrncatExit16) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-ssse3.S" + + .p2align 4 +L(CopyFrom1To16Bytes): + add %rsi, %rdx + add %rsi, %rcx + + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + movlpd (%rcx), %xmm0 + movlpd 8(%rcx), %xmm1 + movlpd %xmm0, (%rdx) + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit1): + xor %ah, %ah + movb %ah, 1(%rdx) +L(Exit1): + movb (%rcx), %al + movb %al, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit2): + xor %ah, %ah + movb %ah, 2(%rdx) +L(Exit2): + movw (%rcx), %ax + movw %ax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit3): + xor %ah, %ah + movb %ah, 3(%rdx) +L(Exit3): + movw (%rcx), %ax + movw %ax, (%rdx) + movb 2(%rcx), %al + movb %al, 2(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit4): + xor %ah, %ah + movb %ah, 4(%rdx) +L(Exit4): + mov (%rcx), %eax + mov %eax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit5): + xor %ah, %ah + movb %ah, 5(%rdx) +L(Exit5): + mov (%rcx), %eax + mov %eax, (%rdx) + movb 4(%rcx), %al + movb %al, 4(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit6): + xor %ah, %ah + movb %ah, 6(%rdx) +L(Exit6): + mov (%rcx), %eax + mov %eax, (%rdx) + movw 4(%rcx), %ax + movw %ax, 4(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit7): + xor %ah, %ah + movb %ah, 7(%rdx) +L(Exit7): + mov (%rcx), %eax + mov %eax, (%rdx) + mov 3(%rcx), %eax + mov %eax, 3(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit8): + xor %ah, %ah + movb %ah, 8(%rdx) +L(Exit8): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit9): + xor %ah, %ah + movb %ah, 9(%rdx) +L(Exit9): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movb 8(%rcx), %al + movb %al, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit10): + xor %ah, %ah + movb %ah, 10(%rdx) +L(Exit10): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movw 8(%rcx), %ax + movw %ax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit11): + xor %ah, %ah + movb %ah, 11(%rdx) +L(Exit11): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov 7(%rcx), %eax + mov %eax, 7(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit12): + xor %ah, %ah + movb %ah, 12(%rdx) +L(Exit12): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov 8(%rcx), %eax + mov %eax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit13): + xor %ah, %ah + movb %ah, 13(%rdx) +L(Exit13): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 5(%rcx), %xmm1 + movlpd %xmm1, 5(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit14): + xor %ah, %ah + movb %ah, 14(%rdx) +L(Exit14): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 6(%rcx), %xmm1 + movlpd %xmm1, 6(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit15): + xor %ah, %ah + movb %ah, 15(%rdx) +L(Exit15): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 7(%rcx), %xmm1 + movlpd %xmm1, 7(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit16): + xor %ah, %ah + movb %ah, 16(%rdx) +L(Exit16): + movlpd (%rcx), %xmm0 + movlpd 8(%rcx), %xmm1 + movlpd %xmm0, (%rdx) + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rsi, %rcx + lea (%rsi, %rdx), %rsi + lea -9(%r8), %rdx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%rsi), %rdx + jz L(ExitHighCase2) + + test $0x01, %al + jnz L(Exit1) + cmp $1, %r8 + je L(StrncatExit1) + test $0x02, %al + jnz L(Exit2) + cmp $2, %r8 + je L(StrncatExit2) + test $0x04, %al + jnz L(Exit3) + cmp $3, %r8 + je L(StrncatExit3) + test $0x08, %al + jnz L(Exit4) + cmp $4, %r8 + je L(StrncatExit4) + test $0x10, %al + jnz L(Exit5) + cmp $5, %r8 + je L(StrncatExit5) + test $0x20, %al + jnz L(Exit6) + cmp $6, %r8 + je L(StrncatExit6) + test $0x40, %al + jnz L(Exit7) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax + xor %cl, %cl + movb %cl, (%rax) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHighCase2): + test $0x01, %ah + jnz L(Exit9) + cmp $9, %r8 + je L(StrncatExit9) + test $0x02, %ah + jnz L(Exit10) + cmp $10, %r8 + je L(StrncatExit10) + test $0x04, %ah + jnz L(Exit11) + cmp $11, %r8 + je L(StrncatExit11) + test $0x8, %ah + jnz L(Exit12) + cmp $12, %r8 + je L(StrncatExit12) + test $0x10, %ah + jnz L(Exit13) + cmp $13, %r8 + je L(StrncatExit13) + test $0x20, %ah + jnz L(Exit14) + cmp $14, %r8 + je L(StrncatExit14) + test $0x40, %ah + jnz L(Exit15) + cmp $15, %r8 + je L(StrncatExit15) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 8(%rcx), %xmm1 + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + +L(CopyFrom1To16BytesCase2OrCase3): + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rsi, %rdx + add %rsi, %rcx + + cmp $8, %r8 + ja L(ExitHighCase3) + cmp $1, %r8 + je L(StrncatExit1) + cmp $2, %r8 + je L(StrncatExit2) + cmp $3, %r8 + je L(StrncatExit3) + cmp $4, %r8 + je L(StrncatExit4) + cmp $5, %r8 + je L(StrncatExit5) + cmp $6, %r8 + je L(StrncatExit6) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + xor %ah, %ah + movb %ah, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHighCase3): + cmp $9, %r8 + je L(StrncatExit9) + cmp $10, %r8 + je L(StrncatExit10) + cmp $11, %r8 + je L(StrncatExit11) + cmp $12, %r8 + je L(StrncatExit12) + cmp $13, %r8 + je L(StrncatExit13) + cmp $14, %r8 + je L(StrncatExit14) + cmp $15, %r8 + je L(StrncatExit15) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 8(%rcx), %xmm1 + movlpd %xmm1, 8(%rdx) + xor %ah, %ah + movb %ah, 16(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit0): + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit15Bytes): + cmp $9, %r8 + je L(StrncatExit9) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmp $10, %r8 + je L(StrncatExit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmp $11, %r8 + je L(StrncatExit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmp $12, %r8 + je L(StrncatExit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmp $13, %r8 + je L(StrncatExit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmp $14, %r8 + je L(StrncatExit14) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 7(%rcx), %xmm1 + movlpd %xmm1, 7(%rdx) + lea 14(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax + xor %cl, %cl + movb %cl, (%rax) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit8Bytes): + cmpb $0, (%rcx) + jz L(Exit1) + cmp $1, %r8 + je L(StrncatExit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmp $2, %r8 + je L(StrncatExit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmp $3, %r8 + je L(StrncatExit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmp $4, %r8 + je L(StrncatExit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmp $5, %r8 + je L(StrncatExit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmp $6, %r8 + je L(StrncatExit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax + xor %cl, %cl + movb %cl, (%rax) + mov %rdi, %rax + ret + +# endif +END (STRCAT) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcat.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcat.S new file mode 100644 index 0000000000..0e0e5dda9c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcat.S @@ -0,0 +1,85 @@ +/* Multiple versions of strcat + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef USE_AS_STRNCAT +# ifndef STRCAT +# define STRCAT strcat +# endif +#endif + +#ifdef USE_AS_STRNCAT +# define STRCAT_SSSE3 __strncat_ssse3 +# define STRCAT_SSE2 __strncat_sse2 +# define STRCAT_SSE2_UNALIGNED __strncat_sse2_unaligned +# define __GI_STRCAT __GI_strncat +# define __GI___STRCAT __GI___strncat +#else +# define STRCAT_SSSE3 __strcat_ssse3 +# define STRCAT_SSE2 __strcat_sse2 +# define STRCAT_SSE2_UNALIGNED __strcat_sse2_unaligned +# define __GI_STRCAT __GI_strcat +# define __GI___STRCAT __GI___strcat +#endif + + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(STRCAT) + .type STRCAT, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq STRCAT_SSE2_UNALIGNED(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + leaq STRCAT_SSE2(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jz 2f + leaq STRCAT_SSSE3(%rip), %rax +2: ret +END(STRCAT) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCAT_SSE2, @function; \ + .align 16; \ + .globl STRCAT_SSE2; \ + .hidden STRCAT_SSE2; \ + STRCAT_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCAT_SSE2, .-STRCAT_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcat calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCAT; __GI_STRCAT = STRCAT_SSE2 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCAT; __GI___STRCAT = STRCAT_SSE2 +#endif + +#ifndef USE_AS_STRNCAT +# include "../strcat.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S b/REORG.TODO/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S new file mode 100644 index 0000000000..cbbd0b33d3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S @@ -0,0 +1,280 @@ +/* strchr with SSE2 without bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> +# include "asm-syntax.h" + + atom_text_section +ENTRY (__strchr_sse2_no_bsf) + movd %esi, %xmm1 + movq %rdi, %rcx + punpcklbw %xmm1, %xmm1 + andq $~15, %rdi + pxor %xmm2, %xmm2 + punpcklbw %xmm1, %xmm1 + orl $0xffffffff, %esi + movdqa (%rdi), %xmm0 + pshufd $0, %xmm1, %xmm1 + subq %rdi, %rcx + movdqa %xmm0, %xmm3 + leaq 16(%rdi), %rdi + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm3 + shl %cl, %esi + pmovmskb %xmm0, %eax + pmovmskb %xmm3, %edx + andl %esi, %eax + andl %esi, %edx + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + +L(loop): + movdqa (%rdi), %xmm0 + leaq 16(%rdi), %rdi + movdqa %xmm0, %xmm3 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm3 + pmovmskb %xmm0, %eax + pmovmskb %xmm3, %edx + or %eax, %edx + jz L(loop) + + pmovmskb %xmm3, %edx + test %eax, %eax + jnz L(matches) + +/* Return NULL. */ + .p2align 4 +L(return_null): + xor %rax, %rax + ret + +L(matches): + /* There is a match. First find where NULL is. */ + leaq -16(%rdi), %rdi + test %edx, %edx + jz L(match_case1) + + .p2align 4 +L(match_case2): + test %al, %al + jz L(match_high_case2) + + mov %al, %cl + and $15, %cl + jnz L(match_case2_4) + + mov %dl, %ch + and $15, %ch + jnz L(return_null) + + test $0x10, %al + jnz L(Exit5) + test $0x10, %dl + jnz L(return_null) + test $0x20, %al + jnz L(Exit6) + test $0x20, %dl + jnz L(return_null) + test $0x40, %al + jnz L(Exit7) + test $0x40, %dl + jnz L(return_null) + lea 7(%rdi), %rax + ret + + .p2align 4 +L(match_case2_4): + test $0x01, %al + jnz L(Exit1) + test $0x01, %dl + jnz L(return_null) + test $0x02, %al + jnz L(Exit2) + test $0x02, %dl + jnz L(return_null) + test $0x04, %al + jnz L(Exit3) + test $0x04, %dl + jnz L(return_null) + lea 3(%rdi), %rax + ret + + .p2align 4 +L(match_high_case2): + test %dl, %dl + jnz L(return_null) + + mov %ah, %cl + and $15, %cl + jnz L(match_case2_12) + + mov %dh, %ch + and $15, %ch + jnz L(return_null) + + test $0x10, %ah + jnz L(Exit13) + test $0x10, %dh + jnz L(return_null) + test $0x20, %ah + jnz L(Exit14) + test $0x20, %dh + jnz L(return_null) + test $0x40, %ah + jnz L(Exit15) + test $0x40, %dh + jnz L(return_null) + lea 15(%rdi), %rax + ret + + .p2align 4 +L(match_case2_12): + test $0x01, %ah + jnz L(Exit9) + test $0x01, %dh + jnz L(return_null) + test $0x02, %ah + jnz L(Exit10) + test $0x02, %dh + jnz L(return_null) + test $0x04, %ah + jnz L(Exit11) + test $0x04, %dh + jnz L(return_null) + lea 11(%rdi), %rax + ret + + .p2align 4 +L(match_case1): + test %al, %al + jz L(match_high_case1) + + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + lea 7(%rdi), %rax + ret + + .p2align 4 +L(match_high_case1): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + lea 15(%rdi), %rax + ret + + .p2align 4 +L(Exit1): + lea (%rdi), %rax + ret + + .p2align 4 +L(Exit2): + lea 1(%rdi), %rax + ret + + .p2align 4 +L(Exit3): + lea 2(%rdi), %rax + ret + + .p2align 4 +L(Exit4): + lea 3(%rdi), %rax + ret + + .p2align 4 +L(Exit5): + lea 4(%rdi), %rax + ret + + .p2align 4 +L(Exit6): + lea 5(%rdi), %rax + ret + + .p2align 4 +L(Exit7): + lea 6(%rdi), %rax + ret + + .p2align 4 +L(Exit9): + lea 8(%rdi), %rax + ret + + .p2align 4 +L(Exit10): + lea 9(%rdi), %rax + ret + + .p2align 4 +L(Exit11): + lea 10(%rdi), %rax + ret + + .p2align 4 +L(Exit12): + lea 11(%rdi), %rax + ret + + .p2align 4 +L(Exit13): + lea 12(%rdi), %rax + ret + + .p2align 4 +L(Exit14): + lea 13(%rdi), %rax + ret + + .p2align 4 +L(Exit15): + lea 14(%rdi), %rax + ret + +END (__strchr_sse2_no_bsf) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strchr.S b/REORG.TODO/sysdeps/x86_64/multiarch/strchr.S new file mode 100644 index 0000000000..c9f54ca2e2 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strchr.S @@ -0,0 +1,57 @@ +/* Multiple versions of strchr + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(strchr) + .type strchr, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strchr_sse2(%rip), %rax +2: HAS_ARCH_FEATURE (Slow_BSF) + jz 3f + leaq __strchr_sse2_no_bsf(%rip), %rax +3: ret +END(strchr) + + + +# undef ENTRY +# define ENTRY(name) \ + .type __strchr_sse2, @function; \ + .align 16; \ + .globl __strchr_sse2; \ + .hidden __strchr_sse2; \ + __strchr_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strchr_sse2, .-__strchr_sse2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strchr calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strchr; __GI_strchr = __strchr_sse2 +#endif + +#include "../strchr.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S new file mode 100644 index 0000000000..b0992dce39 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S @@ -0,0 +1,213 @@ +/* strcmp with unaligned loads + Copyright (C) 2013-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +#include "sysdep.h" + +ENTRY ( __strcmp_sse2_unaligned) + movl %edi, %eax + xorl %edx, %edx + pxor %xmm7, %xmm7 + orl %esi, %eax + andl $4095, %eax + cmpl $4032, %eax + jg L(cross_page) + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pminub %xmm1, %xmm0 + pxor %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + testq %rax, %rax + je L(next_48_bytes) +L(return): + bsfq %rax, %rdx + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + subl %edx, %eax + ret + + .p2align 4 +L(next_48_bytes): + movdqu 16(%rdi), %xmm6 + movdqu 16(%rsi), %xmm3 + movdqu 32(%rdi), %xmm5 + pcmpeqb %xmm6, %xmm3 + movdqu 32(%rsi), %xmm2 + pminub %xmm6, %xmm3 + pcmpeqb %xmm1, %xmm3 + movdqu 48(%rdi), %xmm4 + pcmpeqb %xmm5, %xmm2 + pmovmskb %xmm3, %edx + movdqu 48(%rsi), %xmm0 + pminub %xmm5, %xmm2 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm2, %eax + salq $16, %rdx + pminub %xmm4, %xmm0 + pcmpeqb %xmm1, %xmm0 + salq $32, %rax + orq %rdx, %rax + pmovmskb %xmm0, %ecx + movq %rcx, %rdx + salq $48, %rdx + orq %rdx, %rax + jne L(return) +L(main_loop_header): + leaq 64(%rdi), %rdx + movl $4096, %ecx + pxor %xmm9, %xmm9 + andq $-64, %rdx + subq %rdi, %rdx + leaq (%rdi, %rdx), %rax + addq %rsi, %rdx + movq %rdx, %rsi + andl $4095, %esi + subq %rsi, %rcx + shrq $6, %rcx + movq %rcx, %rsi + jmp L(loop_start) + + .p2align 4 +L(loop): + addq $64, %rax + addq $64, %rdx +L(loop_start): + testq %rsi, %rsi + leaq -1(%rsi), %rsi + je L(loop_cross_page) +L(back_to_loop): + movdqu (%rdx), %xmm0 + movdqu 16(%rdx), %xmm1 + movdqa (%rax), %xmm2 + movdqa 16(%rax), %xmm3 + pcmpeqb %xmm2, %xmm0 + movdqu 32(%rdx), %xmm5 + pcmpeqb %xmm3, %xmm1 + pminub %xmm2, %xmm0 + movdqu 48(%rdx), %xmm6 + pminub %xmm3, %xmm1 + movdqa 32(%rax), %xmm2 + pminub %xmm1, %xmm0 + movdqa 48(%rax), %xmm3 + pcmpeqb %xmm2, %xmm5 + pcmpeqb %xmm3, %xmm6 + pminub %xmm2, %xmm5 + pminub %xmm3, %xmm6 + pminub %xmm5, %xmm0 + pminub %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %ecx + testl %ecx, %ecx + je L(loop) + pcmpeqb %xmm7, %xmm5 + movdqu (%rdx), %xmm0 + pcmpeqb %xmm7, %xmm1 + movdqa (%rax), %xmm2 + pcmpeqb %xmm2, %xmm0 + pminub %xmm2, %xmm0 + pcmpeqb %xmm7, %xmm6 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm1, %ecx + pmovmskb %xmm5, %r8d + pmovmskb %xmm0, %edi + salq $16, %rcx + salq $32, %r8 + pmovmskb %xmm6, %esi + orq %r8, %rcx + orq %rdi, %rcx + salq $48, %rsi + orq %rsi, %rcx + bsfq %rcx, %rcx + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax + ret + + .p2align 4 +L(loop_cross_page): + xor %r10, %r10 + movq %rdx, %r9 + and $63, %r9 + subq %r9, %r10 + + movdqa (%rdx, %r10), %xmm0 + movdqa 16(%rdx, %r10), %xmm1 + movdqu (%rax, %r10), %xmm2 + movdqu 16(%rax, %r10), %xmm3 + pcmpeqb %xmm2, %xmm0 + movdqa 32(%rdx, %r10), %xmm5 + pcmpeqb %xmm3, %xmm1 + pminub %xmm2, %xmm0 + movdqa 48(%rdx, %r10), %xmm6 + pminub %xmm3, %xmm1 + movdqu 32(%rax, %r10), %xmm2 + movdqu 48(%rax, %r10), %xmm3 + pcmpeqb %xmm2, %xmm5 + pcmpeqb %xmm3, %xmm6 + pminub %xmm2, %xmm5 + pminub %xmm3, %xmm6 + + pcmpeqb %xmm7, %xmm0 + pcmpeqb %xmm7, %xmm1 + pcmpeqb %xmm7, %xmm5 + pcmpeqb %xmm7, %xmm6 + + pmovmskb %xmm1, %ecx + pmovmskb %xmm5, %r8d + pmovmskb %xmm0, %edi + salq $16, %rcx + salq $32, %r8 + pmovmskb %xmm6, %esi + orq %r8, %rdi + orq %rcx, %rdi + salq $48, %rsi + orq %rsi, %rdi + movq %r9, %rcx + movq $63, %rsi + shrq %cl, %rdi + test %rdi, %rdi + je L(back_to_loop) + bsfq %rdi, %rcx + movzbl (%rax, %rcx), %eax + movzbl (%rdx, %rcx), %edx + subl %edx, %eax + ret + + .p2align 4 +L(cross_page_loop): + cmpb %cl, %al + jne L(different) + addq $1, %rdx + cmpq $64, %rdx + je L(main_loop_header) +L(cross_page): + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx + testb %al, %al + jne L(cross_page_loop) + xorl %eax, %eax +L(different): + subl %ecx, %eax + ret +END (__strcmp_sse2_unaligned) + +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse42.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse42.S new file mode 100644 index 0000000000..ed26d4a8fb --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-sse42.S @@ -0,0 +1,1792 @@ +/* strcmp with SSE4.2 + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +/* We use 0x1a: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_EACH + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to find out if two 16byte data elements are the same + and the offset of the first different byte. There are 4 cases: + + 1. Both 16byte data elements are valid and identical. + 2. Both 16byte data elements have EOS and identical. + 3. Both 16byte data elements are valid and they differ at offset X. + 4. At least one 16byte data element has EOS at offset X. Two 16byte + data elements must differ at or before offset X. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases: + + case ECX CFlag ZFlag SFlag + 1 16 0 0 0 + 2 16 0 1 1 + 3 X 1 0 0 + 4 0 <= X 1 0/1 0/1 + + We exit from the loop for cases 2, 3 and 4 with jbe which branches + when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for + case 2. */ + + /* Put all SSE 4.2 functions together. */ + .section .text.SECTION,"ax",@progbits + .align 16 + .type STRCMP_SSE42, @function + .globl STRCMP_SSE42 + .hidden STRCMP_SSE42 +#ifdef USE_AS_STRCASECMP_L +ENTRY (GLABEL(__strcasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + + // XXX 5 byte should be before the function + /* 5-byte NOP. */ + .byte 0x0f,0x1f,0x44,0x00,0x00 +END (GLABEL(__strcasecmp)) + /* FALLTHROUGH to strcasecmp_l. */ +#endif +#ifdef USE_AS_STRNCASECMP_L +ENTRY (GLABEL(__strncasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + + // XXX 5 byte should be before the function + /* 5-byte NOP. */ + .byte 0x0f,0x1f,0x44,0x00,0x00 +END (GLABEL(__strncasecmp)) + /* FALLTHROUGH to strncasecmp_l. */ +#endif + + +#ifdef USE_AVX +# define movdqa vmovdqa +# define movdqu vmovdqu +# define pmovmskb vpmovmskb +# define pcmpistri vpcmpistri +# define psubb vpsubb +# define pcmpeqb vpcmpeqb +# define psrldq vpsrldq +# define pslldq vpslldq +# define palignr vpalignr +# define pxor vpxor +# define D(arg) arg, arg +#else +# define D(arg) arg +#endif + +STRCMP_SSE42: + cfi_startproc + CALL_MCOUNT + +/* + * This implementation uses SSE to compare up to 16 bytes at a time. + */ +#ifdef USE_AS_STRCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP +# else + mov (%rdx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strcasecmp_l_nonascii +#endif +#ifdef USE_AS_STRNCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP +# else + mov (%rcx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strncasecmp_l_nonascii +#endif + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + test %rdx, %rdx + je LABEL(strcmp_exitz) + cmp $1, %rdx + je LABEL(Byte0) + mov %rdx, %r11 +#endif + mov %esi, %ecx + mov %edi, %eax +/* Use 64bit AND here to avoid long NOP padding. */ + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +LABEL(belowupper): + .quad 0x4040404040404040 + .quad 0x4040404040404040 +LABEL(topupper): +# ifdef USE_AVX + .quad 0x5a5a5a5a5a5a5a5a + .quad 0x5a5a5a5a5a5a5a5a +# else + .quad 0x5b5b5b5b5b5b5b5b + .quad 0x5b5b5b5b5b5b5b5b +# endif +LABEL(touppermask): + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + movdqa LABEL(belowupper)(%rip), %xmm4 +# define UCLOW_reg %xmm4 + movdqa LABEL(topupper)(%rip), %xmm5 +# define UCHIGH_reg %xmm5 + movdqa LABEL(touppermask)(%rip), %xmm6 +# define LCQWORD_reg %xmm6 +#endif + cmp $0x30, %ecx + ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */ + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef USE_AVX +# define TOLOWER(reg1, reg2) \ + vpcmpgtb UCLOW_reg, reg1, %xmm7; \ + vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ + vpcmpgtb UCLOW_reg, reg2, %xmm9; \ + vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ + vpandn %xmm7, %xmm8, %xmm8; \ + vpandn %xmm9, %xmm10, %xmm10; \ + vpand LCQWORD_reg, %xmm8, %xmm8; \ + vpand LCQWORD_reg, %xmm10, %xmm10; \ + vpor reg1, %xmm8, reg1; \ + vpor reg2, %xmm10, reg2 +# else +# define TOLOWER(reg1, reg2) \ + movdqa reg1, %xmm7; \ + movdqa UCHIGH_reg, %xmm8; \ + movdqa reg2, %xmm9; \ + movdqa UCHIGH_reg, %xmm10; \ + pcmpgtb UCLOW_reg, %xmm7; \ + pcmpgtb reg1, %xmm8; \ + pcmpgtb UCLOW_reg, %xmm9; \ + pcmpgtb reg2, %xmm10; \ + pand %xmm8, %xmm7; \ + pand %xmm10, %xmm9; \ + pand LCQWORD_reg, %xmm7; \ + pand LCQWORD_reg, %xmm9; \ + por %xmm7, reg1; \ + por %xmm9, reg2 +# endif + TOLOWER (%xmm1, %xmm2) +#else +# define TOLOWER(reg1, reg2) +#endif + pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ + pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ + psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes)/* If not, find different value or null char */ +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz)/* finish comparison */ +#endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine source and destination string offsets from 16-byte + * alignment. Use relative offset difference between the two to + * determine which case below to use. + */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + lea 15(%rax), %r9 + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ + lea (%r10, %r9), %r10 + jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +LABEL(ashr_0): + + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ +#else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ +#endif + psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx must be the same with r9d if in left byte (16-rcx) is equal to + * the start from (16-rax) and no null char was seen. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32-bytes per iteration. + */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + .p2align 4 +LABEL(ashr_0_use): + movdqa (%rdi,%rdx), %xmm0 +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + lea 16(%rdx), %rdx + jbe LABEL(ashr_0_exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + movdqa (%rdi,%rdx), %xmm0 +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + lea 16(%rdx), %rdx + jbe LABEL(ashr_0_exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + jmp LABEL(ashr_0_use) + + + .p2align 4 +LABEL(ashr_0_exit_use): + jnc LABEL(strcmp_exitz) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rcx, %r11 + jbe LABEL(strcmp_exitz) +#endif + lea -16(%rdx, %rcx), %rcx + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %edx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx + movl (%rcx,%rax,4), %eax + movl (%rcx,%rdx,4), %edx +#endif + sub %edx, %eax + ret + + + +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +LABEL(ashr_1): + pslldq $15, D(%xmm2) /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ + psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads*/ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_1_use): + add $16, %r10 + jg LABEL(nibble_ashr_1_use) + +LABEL(nibble_ashr_1_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $1, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_1_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $1, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_1_use) + + .p2align 4 +LABEL(nibble_ashr_1_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $1, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $14, %ecx + ja LABEL(nibble_ashr_1_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +LABEL(ashr_2): + pslldq $14, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_2_use): + add $16, %r10 + jg LABEL(nibble_ashr_2_use) + +LABEL(nibble_ashr_2_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $2, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_2_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $2, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_2_use) + + .p2align 4 +LABEL(nibble_ashr_2_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $2, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $13, %ecx + ja LABEL(nibble_ashr_2_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +LABEL(ashr_3): + pslldq $13, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + +LABEL(loop_ashr_3_use): + add $16, %r10 + jg LABEL(nibble_ashr_3_use) + +LABEL(nibble_ashr_3_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $3, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_3_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $3, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_3_use) + + .p2align 4 +LABEL(nibble_ashr_3_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $3, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $12, %ecx + ja LABEL(nibble_ashr_3_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +LABEL(ashr_4): + pslldq $12, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_4_use): + add $16, %r10 + jg LABEL(nibble_ashr_4_use) + +LABEL(nibble_ashr_4_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $4, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_4_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $4, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_4_use) + + .p2align 4 +LABEL(nibble_ashr_4_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $4, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $11, %ecx + ja LABEL(nibble_ashr_4_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +LABEL(ashr_5): + pslldq $11, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_5_use): + add $16, %r10 + jg LABEL(nibble_ashr_5_use) + +LABEL(nibble_ashr_5_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $5, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_5_use) + + movdqa (%rdi, %rdx), %xmm0 + + palignr $5, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_5_use) + + .p2align 4 +LABEL(nibble_ashr_5_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $5, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $10, %ecx + ja LABEL(nibble_ashr_5_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + */ + .p2align 4 +LABEL(ashr_6): + pslldq $10, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_6_use): + add $16, %r10 + jg LABEL(nibble_ashr_6_use) + +LABEL(nibble_ashr_6_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $6, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_6_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $6, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_6_use) + + .p2align 4 +LABEL(nibble_ashr_6_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $6, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $9, %ecx + ja LABEL(nibble_ashr_6_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + */ + .p2align 4 +LABEL(ashr_7): + pslldq $9, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 7(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_7_use): + add $16, %r10 + jg LABEL(nibble_ashr_7_use) + +LABEL(nibble_ashr_7_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $7, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_7_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $7, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_7_use) + + .p2align 4 +LABEL(nibble_ashr_7_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $7, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $8, %ecx + ja LABEL(nibble_ashr_7_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 + */ + .p2align 4 +LABEL(ashr_8): + pslldq $8, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $8, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 8(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_8_use): + add $16, %r10 + jg LABEL(nibble_ashr_8_use) + +LABEL(nibble_ashr_8_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $8, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_8_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $8, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_8_use) + + .p2align 4 +LABEL(nibble_ashr_8_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $8, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $7, %ecx + ja LABEL(nibble_ashr_8_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 + */ + .p2align 4 +LABEL(ashr_9): + pslldq $7, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $9, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 9(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_9_use): + add $16, %r10 + jg LABEL(nibble_ashr_9_use) + +LABEL(nibble_ashr_9_restart_use): + movdqa (%rdi, %rdx), %xmm0 + + palignr $9, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_9_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $9, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_9_use) + + .p2align 4 +LABEL(nibble_ashr_9_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $9, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $6, %ecx + ja LABEL(nibble_ashr_9_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 + */ + .p2align 4 +LABEL(ashr_10): + pslldq $6, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $10, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 10(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_10_use): + add $16, %r10 + jg LABEL(nibble_ashr_10_use) + +LABEL(nibble_ashr_10_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $10, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_10_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $10, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_10_use) + + .p2align 4 +LABEL(nibble_ashr_10_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $10, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $5, %ecx + ja LABEL(nibble_ashr_10_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 + */ + .p2align 4 +LABEL(ashr_11): + pslldq $5, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $11, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 11(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_11_use): + add $16, %r10 + jg LABEL(nibble_ashr_11_use) + +LABEL(nibble_ashr_11_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $11, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_11_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $11, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_11_use) + + .p2align 4 +LABEL(nibble_ashr_11_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $11, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $4, %ecx + ja LABEL(nibble_ashr_11_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 + */ + .p2align 4 +LABEL(ashr_12): + pslldq $4, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $12, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 12(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_12_use): + add $16, %r10 + jg LABEL(nibble_ashr_12_use) + +LABEL(nibble_ashr_12_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $12, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_12_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $12, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_12_use) + + .p2align 4 +LABEL(nibble_ashr_12_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $12, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $3, %ecx + ja LABEL(nibble_ashr_12_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 + */ + .p2align 4 +LABEL(ashr_13): + pslldq $3, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $13, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 13(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_13_use): + add $16, %r10 + jg LABEL(nibble_ashr_13_use) + +LABEL(nibble_ashr_13_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $13, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_13_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $13, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_13_use) + + .p2align 4 +LABEL(nibble_ashr_13_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $13, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $2, %ecx + ja LABEL(nibble_ashr_13_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 + */ + .p2align 4 +LABEL(ashr_14): + pslldq $2, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $14, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 14(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_14_use): + add $16, %r10 + jg LABEL(nibble_ashr_14_use) + +LABEL(nibble_ashr_14_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $14, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_14_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $14, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_14_use) + + .p2align 4 +LABEL(nibble_ashr_14_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $14, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $1, %ecx + ja LABEL(nibble_ashr_14_restart_use) + + jmp LABEL(nibble_ashr_exit_use) + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 + */ + .p2align 4 +LABEL(ashr_15): + pslldq $1, D(%xmm2) + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, D(%xmm2) + psubb %xmm0, D(%xmm2) + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + mov $16, %rcx /* index for loads */ + mov $15, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 15(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + + sub $0x1000, %r10 /* subtract 4K pagesize */ + + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ + + .p2align 4 +LABEL(loop_ashr_15_use): + add $16, %r10 + jg LABEL(nibble_ashr_15_use) + +LABEL(nibble_ashr_15_restart_use): + movdqa (%rdi, %rdx), %xmm0 + palignr $15, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rdx + add $16, %r10 + jg LABEL(nibble_ashr_15_use) + + movdqa (%rdi, %rdx), %xmm0 + palignr $15, -16(%rdi, %rdx), D(%xmm0) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + jbe LABEL(exit_use) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rdx + jmp LABEL(loop_ashr_15_use) + + .p2align 4 +LABEL(nibble_ashr_15_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 + psrldq $15, D(%xmm0) + pcmpistri $0x3a,%xmm0, %xmm0 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx + jae LABEL(nibble_ashr_exit_use) +#endif + cmp $0, %ecx + ja LABEL(nibble_ashr_15_restart_use) + +LABEL(nibble_ashr_exit_use): +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 +#else + movdqa (%rsi,%rdx), %xmm1 + TOLOWER (%xmm0, %xmm1) + pcmpistri $0x1a, %xmm1, %xmm0 +#endif + .p2align 4 +LABEL(exit_use): + jnc LABEL(strcmp_exitz) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rcx, %r11 + jbe LABEL(strcmp_exitz) +#endif + add %rcx, %rdx + lea -16(%rdi, %r9), %rdi + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %edx + test %r8d, %r8d + jz LABEL(ret_use) + xchg %eax, %edx +LABEL(ret_use): +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx + movl (%rcx,%rdx,4), %edx + movl (%rcx,%rax,4), %eax +#endif + + sub %edx, %eax + ret + +LABEL(less32bytes): + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ + test %r8d, %r8d + jz LABEL(ret) + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ + + .p2align 4 +LABEL(ret): +LABEL(less16bytes): + bsf %rdx, %rdx /* find and store bit index in %rdx */ + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %rdx, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzbl (%rsi, %rdx), %ecx + movzbl (%rdi, %rdx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +#endif + + sub %ecx, %eax + ret + +LABEL(strcmp_exitz): + xor %eax, %eax + ret + + .p2align 4 + // XXX Same as code above +LABEL(Byte0): + movzx (%rsi), %ecx + movzx (%rdi), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx + movl (%rdx,%rcx,4), %ecx + movl (%rdx,%rax,4), %eax +#endif + + sub %ecx, %eax + ret + cfi_endproc + .size STRCMP_SSE42, .-STRCMP_SSE42 + +#undef UCLOW_reg +#undef UCHIGH_reg +#undef LCQWORD_reg +#undef TOLOWER + + /* Put all SSE 4.2 functions together. */ + .section .rodata.SECTION,"a",@progbits + .p2align 3 +LABEL(unaligned_table): + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_9) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + .int LABEL(ashr_0) - LABEL(unaligned_table) + +#undef LABEL +#undef GLABEL +#undef SECTION +#undef movdqa +#undef movdqu +#undef pmovmskb +#undef pcmpistri +#undef psubb +#undef pcmpeqb +#undef psrldq +#undef pslldq +#undef palignr +#undef pxor +#undef D diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-ssse3.S new file mode 100644 index 0000000000..1b7fa33c91 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp-ssse3.S @@ -0,0 +1,5 @@ +#if IS_IN (libc) +# define USE_SSSE3 1 +# define STRCMP __strcmp_ssse3 +# include "../strcmp.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp.S new file mode 100644 index 0000000000..54f8f7dd44 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcmp.S @@ -0,0 +1,209 @@ +/* Multiple versions of strcmp + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifdef USE_AS_STRNCMP +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. */ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +# define STRCMP_SSE42 __strncmp_sse42 +# define STRCMP_SSSE3 __strncmp_ssse3 +# define STRCMP_SSE2 __strncmp_sse2 +# define __GI_STRCMP __GI_strncmp +#elif defined USE_AS_STRCASECMP_L +# include "locale-defines.h" + +# define UPDATE_STRNCMP_COUNTER + +# define STRCMP_AVX __strcasecmp_l_avx +# define STRCMP_SSE42 __strcasecmp_l_sse42 +# define STRCMP_SSSE3 __strcasecmp_l_ssse3 +# define STRCMP_SSE2 __strcasecmp_l_sse2 +# define __GI_STRCMP __GI___strcasecmp_l +#elif defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" + +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. */ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +# define STRCMP_AVX __strncasecmp_l_avx +# define STRCMP_SSE42 __strncasecmp_l_sse42 +# define STRCMP_SSSE3 __strncasecmp_l_ssse3 +# define STRCMP_SSE2 __strncasecmp_l_sse2 +# define __GI_STRCMP __GI___strncasecmp_l +#else +# define USE_AS_STRCMP +# define UPDATE_STRNCMP_COUNTER +# ifndef STRCMP +# define STRCMP strcmp +# define STRCMP_SSE42 __strcmp_sse42 +# define STRCMP_SSSE3 __strcmp_ssse3 +# define STRCMP_SSE2 __strcmp_sse2 +# define __GI_STRCMP __GI_strcmp +# endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncmp in static library since we + need strncmp before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc) + .text +ENTRY(STRCMP) + .type STRCMP, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX +#ifdef USE_AS_STRCMP + leaq __strcmp_sse2_unaligned(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 3f +#else + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + leaq STRCMP_SSE42(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jnz 3f +#endif +2: leaq STRCMP_SSSE3(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jnz 3f + leaq STRCMP_SSE2(%rip), %rax +3: ret +END(STRCMP) + +# ifdef USE_AS_STRCASECMP_L +ENTRY(__strcasecmp) + .type __strcasecmp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strcasecmp_avx(%rip), %rax + HAS_ARCH_FEATURE (AVX_Usable) + jnz 3f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + leaq __strcasecmp_sse42(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jnz 3f +2: leaq __strcasecmp_ssse3(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jnz 3f + leaq __strcasecmp_sse2(%rip), %rax +3: ret +END(__strcasecmp) +weak_alias (__strcasecmp, strcasecmp) +# endif +# ifdef USE_AS_STRNCASECMP_L +ENTRY(__strncasecmp) + .type __strncasecmp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strncasecmp_avx(%rip), %rax + HAS_ARCH_FEATURE (AVX_Usable) + jnz 3f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + leaq __strncasecmp_sse42(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jnz 3f +2: leaq __strncasecmp_ssse3(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jnz 3f + leaq __strncasecmp_sse2(%rip), %rax +3: ret +END(__strncasecmp) +weak_alias (__strncasecmp, strncasecmp) +# endif + +# undef LABEL +# define LABEL(l) .L##l##_sse42 +# define GLABEL(l) l##_sse42 +# define SECTION sse4.2 +# include "strcmp-sse42.S" + + +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define LABEL(l) .L##l##_avx +# define GLABEL(l) l##_avx +# define USE_AVX 1 +# undef STRCMP_SSE42 +# define STRCMP_SSE42 STRCMP_AVX +# define SECTION avx +# include "strcmp-sse42.S" +# endif + + +# undef ENTRY +# define ENTRY(name) \ + .type STRCMP_SSE2, @function; \ + .align 16; \ + .globl STRCMP_SSE2; \ + .hidden STRCMP_SSE2; \ + STRCMP_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2 + +# ifdef USE_AS_STRCASECMP_L +# define ENTRY2(name) \ + .type __strcasecmp_sse2, @function; \ + .align 16; \ + .globl __strcasecmp_sse2; \ + .hidden __strcasecmp_sse2; \ + __strcasecmp_sse2: cfi_startproc; \ + CALL_MCOUNT +# define END2(name) \ + cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2 +# endif + +# ifdef USE_AS_STRNCASECMP_L +# define ENTRY2(name) \ + .type __strncasecmp_sse2, @function; \ + .align 16; \ + .globl __strncasecmp_sse2; \ + .hidden __strncasecmp_sse2; \ + __strncasecmp_sse2: cfi_startproc; \ + CALL_MCOUNT +# define END2(name) \ + cfi_endproc; .size __strncasecmp_sse2, .-__strncasecmp_sse2 +# endif + +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcmp calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2 +#endif + +#include "../strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S new file mode 100644 index 0000000000..6a5ab7ab26 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S @@ -0,0 +1,1889 @@ +/* strcpy with SSE2 and unaligned load + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strcpy_sse2_unaligned +# endif + +# endif + +# define JMPTBL(I, B) I - B +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + lea (%r11, %rcx), %rcx; \ + jmp *%rcx + +# ifndef USE_AS_STRCAT + +.text +ENTRY (STRCPY) +# ifdef USE_AS_STRNCPY + mov %rdx, %r8 + test %r8, %r8 + jz L(ExitZero) +# endif + mov %rsi, %rcx +# ifndef USE_AS_STPCPY + mov %rdi, %rax /* save result */ +# endif + +# endif + + and $63, %rcx + cmp $32, %rcx + jbe L(SourceStringAlignmentLess32) + + and $-16, %rsi + and $15, %rcx + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%rsi), %xmm1 + pmovmskb %xmm1, %rdx + shr %cl, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + mov $16, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# else + mov $17, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# endif + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%rsi), %xmm0 + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY + add $16, %r10 + cmp %r10, %r8 + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes) + + movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%rdi) + +/* If source address alignment != destination address alignment */ + .p2align 4 +L(Unalign16Both): + sub %rcx, %rdi +# ifdef USE_AS_STRNCPY + add %rcx, %r8 + sbb %rcx, %rcx + or %rcx, %r8 +# endif + mov $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $48, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm2) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm3) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm4 + movdqu %xmm3, (%rdi, %rcx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm4) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm1 + movdqu %xmm4, (%rdi, %rcx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm1) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm2) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm3) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + movdqu %xmm3, (%rdi, %rcx) + mov %rsi, %rdx + lea 16(%rsi, %rcx), %rsi + and $-0x40, %rsi + sub %rsi, %rdx + sub %rdx, %rdi +# ifdef USE_AS_STRNCPY + lea 128(%r8, %rdx), %r8 +# endif +L(Unaligned64Loop): + movaps (%rsi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rsi), %xmm5 + movaps 32(%rsi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rsi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rdx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %rdx, %rdx + jnz L(Unaligned64Leave) + +L(Unaligned64Loop_start): + add $64, %rdi + add $64, %rsi + movdqu %xmm4, -64(%rdi) + movaps (%rsi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%rdi) + movaps 16(%rsi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%rsi), %xmm3 + movdqu %xmm6, -32(%rdi) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%rdi) + movaps 48(%rsi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rdx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %rdx, %rdx + jz L(Unaligned64Loop_start) + +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %rcx, %rcx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 48(%rdi, %rdx), %rax +# endif + movdqu %xmm7, 48(%rdi) + add $15, %r8 + sub %rdx, %r8 + lea 49(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $48, %rsi + add $48, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + +/* If source address alignment == destination address alignment */ + +L(SourceStringAlignmentLess32): + pxor %xmm0, %xmm0 + movdqu (%rsi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $16, %r8 +# else + cmp $17, %r8 +# endif + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1) + + pcmpeqb %xmm2, %xmm0 + movdqu %xmm1, (%rdi) + pmovmskb %xmm0, %rdx + +# ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $32, %r8 +# else + cmp $33, %r8 +# endif + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes1) + + and $-16, %rsi + and $15, %rcx + jmp L(Unalign16Both) + +/*------End of main part with loops---------------------*/ + +/* Case1 */ + +# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) + .p2align 4 +L(CopyFrom1To16Bytes): + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + .p2align 4 +L(CopyFrom1To16BytesTail): + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %rsi + add $16, %rdi +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $16, %r8 +# endif +L(CopyFrom1To16BytesTail1): + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + bsf %rdx, %rdx + add %rcx, %rsi + add $16, %rdx + sub %rcx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %rdx, %rdx +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + movdqu %xmm4, (%rdi) + add $63, %r8 + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 16(%rdi, %rdx), %rax +# endif + movdqu %xmm5, 16(%rdi) + add $47, %r8 + sub %rdx, %r8 + lea 17(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $16, %rsi + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %rdx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 32(%rdi, %rdx), %rax +# endif + movdqu %xmm6, 32(%rdi) + add $31, %r8 + sub %rdx, %r8 + lea 33(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +# else + add $32, %rsi + add $32, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +# endif + +# ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm6): + movdqu %xmm6, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm5): + movdqu %xmm5, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm4): + movdqu %xmm4, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm3): + movdqu %xmm3, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm1): + movdqu %xmm1, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) +# endif + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + add %rcx, %rsi + bsf %rdx, %rdx + add $16, %rdx + sub %rcx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTailCase2): + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To32BytesCase2) + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTailCase2) + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %rdi + add $16, %rsi + sub $16, %r8 +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +# endif + +/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/ + + .p2align 4 +L(Exit1): + mov %dh, (%rdi) +# ifdef USE_AS_STPCPY + lea (%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $1, %r8 + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit2): + mov (%rsi), %dx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $2, %r8 + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit3): + mov (%rsi), %cx + mov %cx, (%rdi) + mov %dh, 2(%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $3, %r8 + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit4): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $4, %r8 + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit5): + mov (%rsi), %ecx + mov %dh, 4(%rdi) + mov %ecx, (%rdi) +# ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $5, %r8 + lea 5(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $6, %r8 + lea 6(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +# ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $7, %r8 + lea 7(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +# ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $8, %r8 + lea 8(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit9): + mov (%rsi), %rcx + mov %dh, 8(%rdi) + mov %rcx, (%rdi) +# ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $9, %r8 + lea 9(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $10, %r8 + lea 10(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $11, %r8 + lea 11(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $12, %r8 + lea 12(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +# ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $13, %r8 + lea 13(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +# ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $14, %r8 + lea 14(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $15, %r8 + lea 15(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +# ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $16, %r8 + lea 16(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit17): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) + mov %dh, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $17, %r8 + lea 17(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $18, %r8 + lea 18(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $19, %r8 + lea 19(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $20, %r8 + lea 20(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dh, 20(%rdi) +# ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $21, %r8 + lea 21(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $22, %r8 + lea 22(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $23, %r8 + lea 23(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $24, %r8 + lea 24(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) + mov %dh, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $25, %r8 + lea 25(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $26, %r8 + lea 26(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +# ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $27, %r8 + lea 27(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit28): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $28, %r8 + lea 28(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +# ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $29, %r8 + lea 29(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $30, %r8 + lea 30(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $31, %r8 + lea 31(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + + .p2align 4 +L(Exit32): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +# endif +# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $32, %r8 + lea 32(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(StrncpyExit0): +# ifdef USE_AS_STPCPY + mov %rdi, %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, (%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit1): + mov (%rsi), %dl + mov %dl, (%rdi) +# ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 1(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit2): + mov (%rsi), %dx + mov %dx, (%rdi) +# ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 2(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit3): + mov (%rsi), %cx + mov 2(%rsi), %dl + mov %cx, (%rdi) + mov %dl, 2(%rdi) +# ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 3(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit4): + mov (%rsi), %edx + mov %edx, (%rdi) +# ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 4(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit5): + mov (%rsi), %ecx + mov 4(%rsi), %dl + mov %ecx, (%rdi) + mov %dl, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 5(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +# ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 6(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +# ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 7(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +# ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 8(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit9): + mov (%rsi), %rcx + mov 8(%rsi), %dl + mov %rcx, (%rdi) + mov %dl, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 9(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 10(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 11(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +# ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 12(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +# ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 13(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +# ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 14(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) +# ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 15(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +# ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 16(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit17): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %cl, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 17(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 18(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 19(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 20(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + mov 20(%rsi), %dl + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dl, 20(%rdi) +# ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 21(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 22(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 23(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 24(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cl, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 25(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 26(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +# ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 27(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit28): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +# ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 28(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +# ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 29(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +# ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 30(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +# ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 31(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit32): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +# ifdef USE_AS_STPCPY + lea 32(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 32(%rdi) +# endif + ret + + .p2align 4 +L(StrncpyExit33): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + mov 32(%rsi), %cl + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) + mov %cl, 32(%rdi) +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 33(%rdi) +# endif + ret + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(Fill0): + ret + + .p2align 4 +L(Fill1): + mov %dl, (%rdi) + ret + + .p2align 4 +L(Fill2): + mov %dx, (%rdi) + ret + + .p2align 4 +L(Fill3): + mov %edx, -1(%rdi) + ret + + .p2align 4 +L(Fill4): + mov %edx, (%rdi) + ret + + .p2align 4 +L(Fill5): + mov %edx, (%rdi) + mov %dl, 4(%rdi) + ret + + .p2align 4 +L(Fill6): + mov %edx, (%rdi) + mov %dx, 4(%rdi) + ret + + .p2align 4 +L(Fill7): + mov %rdx, -1(%rdi) + ret + + .p2align 4 +L(Fill8): + mov %rdx, (%rdi) + ret + + .p2align 4 +L(Fill9): + mov %rdx, (%rdi) + mov %dl, 8(%rdi) + ret + + .p2align 4 +L(Fill10): + mov %rdx, (%rdi) + mov %dx, 8(%rdi) + ret + + .p2align 4 +L(Fill11): + mov %rdx, (%rdi) + mov %edx, 7(%rdi) + ret + + .p2align 4 +L(Fill12): + mov %rdx, (%rdi) + mov %edx, 8(%rdi) + ret + + .p2align 4 +L(Fill13): + mov %rdx, (%rdi) + mov %rdx, 5(%rdi) + ret + + .p2align 4 +L(Fill14): + mov %rdx, (%rdi) + mov %rdx, 6(%rdi) + ret + + .p2align 4 +L(Fill15): + movdqu %xmm0, -1(%rdi) + ret + + .p2align 4 +L(Fill16): + movdqu %xmm0, (%rdi) + ret + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm2): + movdqu %xmm2, (%rdi, %rcx) + + .p2align 4 +L(CopyFrom1To16BytesXmmExit): + bsf %rdx, %rdx + add $15, %r8 + add %rcx, %rdi +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + + .p2align 4 +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %rdx, %rdx + sub $16, %r8 + jbe L(StrncpyFillExit) + + movdqu %xmm0, (%rdi) + add $16, %rdi + + mov %rdi, %rsi + and $0xf, %rsi + sub %rsi, %rdi + add %rsi, %r8 + sub $64, %r8 + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + movdqa %xmm0, 32(%rdi) + movdqa %xmm0, 48(%rdi) + add $64, %rdi + sub $64, %r8 + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %r8 + jl L(StrncpyFillLess32) + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + add $32, %rdi + sub $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillLess32): + add $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillExit): + add $16, %r8 + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +/* end of ifndef USE_AS_STRCAT */ +# endif + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %rdx, %rdx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%r8), %rcx + and $-16, %rcx + add $48, %r8 + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%rdi) +# ifdef USE_AS_STPCPY + lea 64(%rdi), %rax +# endif +# ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 64(%rdi) +# endif + ret + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %rcx, %rcx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $48, %r8 + jle L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +# ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm4) +# else + jnz L(CopyFrom1To16Bytes) +# endif + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm4, (%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +# ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm5) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm5, 16(%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +# ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm6) +# else + jnz L(CopyFrom1To16Bytes) +# endif + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm6, 32(%rdi) + lea 16(%rdi, %rcx), %rdi + lea 16(%rsi, %rcx), %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(ExitZero): +# ifndef USE_AS_STRCAT + mov %rdi, %rax +# endif + ret + +# endif + +# ifndef USE_AS_STRCAT +END (STRCPY) +# else +END (STRCAT) +# endif + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) +# ifdef USE_AS_STRNCPY +L(ExitStrncpyTable): + .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) +# ifndef USE_AS_STRCAT + .p2align 4 +L(FillTable): + .int JMPTBL(L(Fill0), L(FillTable)) + .int JMPTBL(L(Fill1), L(FillTable)) + .int JMPTBL(L(Fill2), L(FillTable)) + .int JMPTBL(L(Fill3), L(FillTable)) + .int JMPTBL(L(Fill4), L(FillTable)) + .int JMPTBL(L(Fill5), L(FillTable)) + .int JMPTBL(L(Fill6), L(FillTable)) + .int JMPTBL(L(Fill7), L(FillTable)) + .int JMPTBL(L(Fill8), L(FillTable)) + .int JMPTBL(L(Fill9), L(FillTable)) + .int JMPTBL(L(Fill10), L(FillTable)) + .int JMPTBL(L(Fill11), L(FillTable)) + .int JMPTBL(L(Fill12), L(FillTable)) + .int JMPTBL(L(Fill13), L(FillTable)) + .int JMPTBL(L(Fill14), L(FillTable)) + .int JMPTBL(L(Fill15), L(FillTable)) + .int JMPTBL(L(Fill16), L(FillTable)) +# endif +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S new file mode 100644 index 0000000000..47aaeae671 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy-ssse3.S @@ -0,0 +1,3551 @@ +/* strcpy with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# ifndef STRCPY +# define STRCPY __strcpy_ssse3 +# endif + + .section .text.ssse3,"ax",@progbits +ENTRY (STRCPY) + + mov %rsi, %rcx +# ifdef USE_AS_STRNCPY + mov %rdx, %r8 +# endif + mov %rdi, %rdx +# ifdef USE_AS_STRNCPY + test %r8, %r8 + jz L(Exit0) + cmp $8, %r8 + jbe L(StrncpyExit8Bytes) +# endif + cmpb $0, (%rcx) + jz L(Exit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmpb $0, 7(%rcx) + jz L(Exit8) +# ifdef USE_AS_STRNCPY + cmp $16, %r8 + jb L(StrncpyExit15Bytes) +# endif + cmpb $0, 8(%rcx) + jz L(Exit9) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmpb $0, 14(%rcx) + jz L(Exit15) +# ifdef USE_AS_STRNCPY + cmp $16, %r8 + je L(Exit16) +# endif + cmpb $0, 15(%rcx) + jz L(Exit16) +# endif + +# ifdef USE_AS_STRNCPY + mov %rcx, %rsi + sub $16, %r8 + and $0xf, %rsi + +/* add 16 bytes rcx_offset to r8 */ + + add %rsi, %r8 +# endif + lea 16(%rcx), %rsi + and $-16, %rsi + pxor %xmm0, %xmm0 + mov (%rcx), %r9 + mov %r9, (%rdx) + pcmpeqb (%rsi), %xmm0 + mov 8(%rcx), %r9 + mov %r9, 8(%rdx) + +/* convert byte mask in xmm0 to bit mask */ + + pmovmskb %xmm0, %rax + sub %rcx, %rsi + +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + mov %rdx, %rax + lea 16(%rdx), %rdx + and $-16, %rdx + sub %rdx, %rax + +# ifdef USE_AS_STRNCPY + add %rax, %rsi + lea -1(%rsi), %rsi + and $1<<31, %esi + test %rsi, %rsi + jnz L(ContinueCopy) + lea 16(%r8), %r8 + +L(ContinueCopy): +# endif + sub %rax, %rcx + mov %rcx, %rax + and $0xf, %rax + mov $0, %rsi + +/* case: rcx_offset == rdx_offset */ + + jz L(Align16Both) + + cmp $8, %rax + jae L(ShlHigh8) + cmp $1, %rax + je L(Shl1) + cmp $2, %rax + je L(Shl2) + cmp $3, %rax + je L(Shl3) + cmp $4, %rax + je L(Shl4) + cmp $5, %rax + je L(Shl5) + cmp $6, %rax + je L(Shl6) + jmp L(Shl7) + +L(ShlHigh8): + je L(Shl8) + cmp $9, %rax + je L(Shl9) + cmp $10, %rax + je L(Shl10) + cmp $11, %rax + je L(Shl11) + cmp $12, %rax + je L(Shl12) + cmp $13, %rax + je L(Shl13) + cmp $14, %rax + je L(Shl14) + jmp L(Shl15) + +L(Align16Both): + movaps (%rcx), %xmm1 + movaps 16(%rcx), %xmm2 + movaps %xmm1, (%rdx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm4 + movaps %xmm3, (%rdx, %rsi) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm1 + movaps %xmm4, (%rdx, %rsi) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm2 + movaps %xmm1, (%rdx, %rsi) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%rdx, %rsi) + mov %rcx, %rax + lea 16(%rcx, %rsi), %rcx + and $-0x40, %rcx + sub %rcx, %rax + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + lea 112(%r8, %rax), %r8 +# endif + mov $-0x40, %rsi + + .p2align 4 +L(Aligned64Loop): + movaps (%rcx), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rcx), %xmm5 + movaps 32(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rcx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rax + lea 64(%rdx), %rdx + lea 64(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeaveCase2OrCase3) +# endif + test %rax, %rax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%rdx) + movaps %xmm5, -48(%rdx) + movaps %xmm6, -32(%rdx) + movaps %xmm7, -16(%rdx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): +# ifdef USE_AS_STRNCPY + lea 48(%r8), %r8 +# endif + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + movaps %xmm4, -64(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + movaps %xmm5, -48(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%rdx) + pcmpeqb %xmm7, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%r8), %r8 +# endif + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl1): + movaps -1(%rcx), %xmm1 + movaps 15(%rcx), %xmm2 +L(Shl1Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl1LoopExit) + + palignr $1, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 31(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -15(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -1(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl1LoopStart): + movaps 15(%rcx), %xmm2 + movaps 31(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 47(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 63(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + test %rax, %rax + palignr $1, %xmm3, %xmm4 + jnz L(Shl1Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave1) +# endif + palignr $1, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $1, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl1LoopStart) + +L(Shl1LoopExit): + movdqu -1(%rcx), %xmm1 + mov $15, %rsi + movdqu %xmm1, -1(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl2): + movaps -2(%rcx), %xmm1 + movaps 14(%rcx), %xmm2 +L(Shl2Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl2LoopExit) + + palignr $2, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 30(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -14(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -2(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl2LoopStart): + movaps 14(%rcx), %xmm2 + movaps 30(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 46(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 62(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + test %rax, %rax + palignr $2, %xmm3, %xmm4 + jnz L(Shl2Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave2) +# endif + palignr $2, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $2, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl2LoopStart) + +L(Shl2LoopExit): + movdqu -2(%rcx), %xmm1 + mov $14, %rsi + movdqu %xmm1, -2(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl3): + movaps -3(%rcx), %xmm1 + movaps 13(%rcx), %xmm2 +L(Shl3Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl3LoopExit) + + palignr $3, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 29(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -13(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -3(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl3LoopStart): + movaps 13(%rcx), %xmm2 + movaps 29(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 45(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 61(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + test %rax, %rax + palignr $3, %xmm3, %xmm4 + jnz L(Shl3Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave3) +# endif + palignr $3, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $3, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl3LoopStart) + +L(Shl3LoopExit): + movdqu -3(%rcx), %xmm1 + mov $13, %rsi + movdqu %xmm1, -3(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl4): + movaps -4(%rcx), %xmm1 + movaps 12(%rcx), %xmm2 +L(Shl4Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 28(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -12(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -4(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl4LoopStart): + movaps 12(%rcx), %xmm2 + movaps 28(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %rax, %rax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave4) +# endif + palignr $4, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movdqu -4(%rcx), %xmm1 + mov $12, %rsi + movdqu %xmm1, -4(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl5): + movaps -5(%rcx), %xmm1 + movaps 11(%rcx), %xmm2 +L(Shl5Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl5LoopExit) + + palignr $5, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 27(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -11(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -5(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl5LoopStart): + movaps 11(%rcx), %xmm2 + movaps 27(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 43(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 59(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + test %rax, %rax + palignr $5, %xmm3, %xmm4 + jnz L(Shl5Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave5) +# endif + palignr $5, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $5, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl5LoopStart) + +L(Shl5LoopExit): + movdqu -5(%rcx), %xmm1 + mov $11, %rsi + movdqu %xmm1, -5(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl6): + movaps -6(%rcx), %xmm1 + movaps 10(%rcx), %xmm2 +L(Shl6Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl6LoopExit) + + palignr $6, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 26(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -10(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -6(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl6LoopStart): + movaps 10(%rcx), %xmm2 + movaps 26(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 42(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 58(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + test %rax, %rax + palignr $6, %xmm3, %xmm4 + jnz L(Shl6Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave6) +# endif + palignr $6, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $6, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl6LoopStart) + +L(Shl6LoopExit): + mov (%rcx), %r9 + mov 6(%rcx), %esi + mov %r9, (%rdx) + mov %esi, 6(%rdx) + mov $10, %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl7): + movaps -7(%rcx), %xmm1 + movaps 9(%rcx), %xmm2 +L(Shl7Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl7LoopExit) + + palignr $7, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 25(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -9(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -7(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl7LoopStart): + movaps 9(%rcx), %xmm2 + movaps 25(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 41(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 57(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + test %rax, %rax + palignr $7, %xmm3, %xmm4 + jnz L(Shl7Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave7) +# endif + palignr $7, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $7, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl7LoopStart) + +L(Shl7LoopExit): + mov (%rcx), %r9 + mov 5(%rcx), %esi + mov %r9, (%rdx) + mov %esi, 5(%rdx) + mov $9, %rsi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl8): + movaps -8(%rcx), %xmm1 + movaps 8(%rcx), %xmm2 +L(Shl8Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 24(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -8(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -8(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl8LoopStart): + movaps 8(%rcx), %xmm2 + movaps 24(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %rax, %rax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave8) +# endif + palignr $8, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + mov (%rcx), %r9 + mov $8, %rsi + mov %r9, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl9): + movaps -9(%rcx), %xmm1 + movaps 7(%rcx), %xmm2 +L(Shl9Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl9LoopExit) + + palignr $9, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 23(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -7(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -9(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl9LoopStart): + movaps 7(%rcx), %xmm2 + movaps 23(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 39(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 55(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + test %rax, %rax + palignr $9, %xmm3, %xmm4 + jnz L(Shl9Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave9) +# endif + palignr $9, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $9, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl9LoopStart) + +L(Shl9LoopExit): + mov -1(%rcx), %r9 + mov $7, %rsi + mov %r9, -1(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl10): + movaps -10(%rcx), %xmm1 + movaps 6(%rcx), %xmm2 +L(Shl10Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl10LoopExit) + + palignr $10, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 22(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -6(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -10(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl10LoopStart): + movaps 6(%rcx), %xmm2 + movaps 22(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 38(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 54(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + test %rax, %rax + palignr $10, %xmm3, %xmm4 + jnz L(Shl10Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave10) +# endif + palignr $10, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $10, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl10LoopStart) + +L(Shl10LoopExit): + mov -2(%rcx), %r9 + mov $6, %rsi + mov %r9, -2(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl11): + movaps -11(%rcx), %xmm1 + movaps 5(%rcx), %xmm2 +L(Shl11Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl11LoopExit) + + palignr $11, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 21(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -5(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -11(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl11LoopStart): + movaps 5(%rcx), %xmm2 + movaps 21(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 37(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 53(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + test %rax, %rax + palignr $11, %xmm3, %xmm4 + jnz L(Shl11Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave11) +# endif + palignr $11, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $11, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl11LoopStart) + +L(Shl11LoopExit): + mov -3(%rcx), %r9 + mov $5, %rsi + mov %r9, -3(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl12): + movaps -12(%rcx), %xmm1 + movaps 4(%rcx), %xmm2 +L(Shl12Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 20(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -4(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -12(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl12LoopStart): + movaps 4(%rcx), %xmm2 + movaps 20(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %rax, %rax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave12) +# endif + palignr $12, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + mov (%rcx), %r9d + mov $4, %rsi + mov %r9d, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl13): + movaps -13(%rcx), %xmm1 + movaps 3(%rcx), %xmm2 +L(Shl13Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl13LoopExit) + + palignr $13, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 19(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -3(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -13(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl13LoopStart): + movaps 3(%rcx), %xmm2 + movaps 19(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 35(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 51(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + test %rax, %rax + palignr $13, %xmm3, %xmm4 + jnz L(Shl13Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave13) +# endif + palignr $13, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $13, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl13LoopStart) + +L(Shl13LoopExit): + mov -1(%rcx), %r9d + mov $3, %rsi + mov %r9d, -1(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl14): + movaps -14(%rcx), %xmm1 + movaps 2(%rcx), %xmm2 +L(Shl14Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl14LoopExit) + + palignr $14, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 18(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -2(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -14(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl14LoopStart): + movaps 2(%rcx), %xmm2 + movaps 18(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 34(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 50(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + test %rax, %rax + palignr $14, %xmm3, %xmm4 + jnz L(Shl14Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave14) +# endif + palignr $14, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $14, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl14LoopStart) + +L(Shl14LoopExit): + mov -2(%rcx), %r9d + mov $2, %rsi + mov %r9d, -2(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl15): + movaps -15(%rcx), %xmm1 + movaps 1(%rcx), %xmm2 +L(Shl15Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx +# ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %rax, %rax + jnz L(Shl15LoopExit) + + palignr $15, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 17(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -1(%rcx), %rcx + sub %rax, %rdx +# ifdef USE_AS_STRNCPY + add %rax, %r8 +# endif + movaps -15(%rcx), %xmm1 + +/* 64 bytes loop */ + .p2align 4 +L(Shl15LoopStart): + movaps 1(%rcx), %xmm2 + movaps 17(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 33(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 49(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + test %rax, %rax + palignr $15, %xmm3, %xmm4 + jnz L(Shl15Start) +# ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(StrncpyLeave15) +# endif + palignr $15, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $15, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl15LoopStart) + +L(Shl15LoopExit): + mov -3(%rcx), %r9d + mov $1, %rsi + mov %r9d, -3(%rdx) +# ifdef USE_AS_STRCAT + jmp L(CopyFrom1To16Bytes) +# endif + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(CopyFrom1To16Bytes): +# ifdef USE_AS_STRNCPY + add $16, %r8 +# endif + add %rsi, %rdx + add %rsi, %rcx + + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + + .p2align 4 +L(Exit8): + mov (%rcx), %rax + mov %rax, (%rdx) +# ifdef USE_AS_STPCPY + lea 7(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $8, %r8 + lea 8(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + + .p2align 4 +L(Exit16): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) +# ifdef USE_AS_STPCPY + lea 15(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $16, %r8 + lea 16(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rsi, %rcx + lea (%rsi, %rdx), %rsi + lea -9(%r8), %rdx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%rsi), %rdx + jz L(ExitHighCase2) + + cmp $1, %r8 + je L(Exit1) + test $0x01, %al + jnz L(Exit1) + cmp $2, %r8 + je L(Exit2) + test $0x02, %al + jnz L(Exit2) + cmp $3, %r8 + je L(Exit3) + test $0x04, %al + jnz L(Exit3) + cmp $4, %r8 + je L(Exit4) + test $0x08, %al + jnz L(Exit4) + cmp $5, %r8 + je L(Exit5) + test $0x10, %al + jnz L(Exit5) + cmp $6, %r8 + je L(Exit6) + test $0x20, %al + jnz L(Exit6) + cmp $7, %r8 + je L(Exit7) + test $0x40, %al + jnz L(Exit7) + jmp L(Exit8) + + .p2align 4 +L(ExitHighCase2): + cmp $9, %r8 + je L(Exit9) + test $0x01, %ah + jnz L(Exit9) + cmp $10, %r8 + je L(Exit10) + test $0x02, %ah + jnz L(Exit10) + cmp $11, %r8 + je L(Exit11) + test $0x04, %ah + jnz L(Exit11) + cmp $12, %r8 + je L(Exit12) + test $0x8, %ah + jnz L(Exit12) + cmp $13, %r8 + je L(Exit13) + test $0x10, %ah + jnz L(Exit13) + cmp $14, %r8 + je L(Exit14) + test $0x20, %ah + jnz L(Exit14) + cmp $15, %r8 + je L(Exit15) + test $0x40, %ah + jnz L(Exit15) + jmp L(Exit16) + +L(CopyFrom1To16BytesCase2OrCase3): + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rsi, %rdx + add %rsi, %rcx + + cmp $16, %r8 + je L(Exit16) + cmp $8, %r8 + je L(Exit8) + jg L(More8Case3) + cmp $4, %r8 + je L(Exit4) + jg L(More4Case3) + cmp $2, %r8 + jl L(Exit1) + je L(Exit2) + jg L(Exit3) +L(More8Case3): /* but less than 16 */ + cmp $12, %r8 + je L(Exit12) + jl L(Less12Case3) + cmp $14, %r8 + jl L(Exit13) + je L(Exit14) + jg L(Exit15) +L(More4Case3): /* but less than 8 */ + cmp $6, %r8 + jl L(Exit5) + je L(Exit6) + jg L(Exit7) +L(Less12Case3): /* but more than 8 */ + cmp $10, %r8 + jl L(Exit9) + je L(Exit10) + jg L(Exit11) +# endif + + .p2align 4 +L(Exit1): + movb (%rcx), %al + movb %al, (%rdx) +# ifdef USE_AS_STPCPY + lea (%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $1, %r8 + lea 1(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit2): + movw (%rcx), %ax + movw %ax, (%rdx) +# ifdef USE_AS_STPCPY + lea 1(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $2, %r8 + lea 2(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit3): + movw (%rcx), %ax + movw %ax, (%rdx) + movb 2(%rcx), %al + movb %al, 2(%rdx) +# ifdef USE_AS_STPCPY + lea 2(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $3, %r8 + lea 3(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit4): + movl (%rcx), %eax + movl %eax, (%rdx) +# ifdef USE_AS_STPCPY + lea 3(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $4, %r8 + lea 4(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit5): + movl (%rcx), %eax + movl %eax, (%rdx) + movb 4(%rcx), %al + movb %al, 4(%rdx) +# ifdef USE_AS_STPCPY + lea 4(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $5, %r8 + lea 5(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit6): + movl (%rcx), %eax + movl %eax, (%rdx) + movw 4(%rcx), %ax + movw %ax, 4(%rdx) +# ifdef USE_AS_STPCPY + lea 5(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $6, %r8 + lea 6(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit7): + movl (%rcx), %eax + movl %eax, (%rdx) + movl 3(%rcx), %eax + movl %eax, 3(%rdx) +# ifdef USE_AS_STPCPY + lea 6(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $7, %r8 + lea 7(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit9): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 5(%rcx), %eax + mov %eax, 5(%rdx) +# ifdef USE_AS_STPCPY + lea 8(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $9, %r8 + lea 9(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit10): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 6(%rcx), %eax + mov %eax, 6(%rdx) +# ifdef USE_AS_STPCPY + lea 9(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $10, %r8 + lea 10(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit11): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %eax + mov %eax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 10(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $11, %r8 + lea 11(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit12): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %eax + mov %eax, 8(%rdx) +# ifdef USE_AS_STPCPY + lea 11(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $12, %r8 + lea 12(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit13): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 5(%rcx), %rax + mov %rax, 5(%rdx) +# ifdef USE_AS_STPCPY + lea 12(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $13, %r8 + lea 13(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit14): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 6(%rcx), %rax + mov %rax, 6(%rdx) +# ifdef USE_AS_STPCPY + lea 13(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $14, %r8 + lea 14(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + + .p2align 4 +L(Exit15): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %rax + mov %rax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 14(%rdx), %rax +# else + mov %rdi, %rax +# endif +# ifdef USE_AS_STRNCPY + sub $15, %r8 + lea 15(%rdx), %rcx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +# endif +# endif + ret + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(Fill0): + ret + + .p2align 4 +L(Fill1): + movb %dl, (%rcx) + ret + + .p2align 4 +L(Fill2): + movw %dx, (%rcx) + ret + + .p2align 4 +L(Fill3): + movw %dx, (%rcx) + movb %dl, 2(%rcx) + ret + + .p2align 4 +L(Fill4): + movl %edx, (%rcx) + ret + + .p2align 4 +L(Fill5): + movl %edx, (%rcx) + movb %dl, 4(%rcx) + ret + + .p2align 4 +L(Fill6): + movl %edx, (%rcx) + movw %dx, 4(%rcx) + ret + + .p2align 4 +L(Fill7): + movl %edx, (%rcx) + movl %edx, 3(%rcx) + ret + + .p2align 4 +L(Fill8): + mov %rdx, (%rcx) + ret + + .p2align 4 +L(Fill9): + mov %rdx, (%rcx) + movb %dl, 8(%rcx) + ret + + .p2align 4 +L(Fill10): + mov %rdx, (%rcx) + movw %dx, 8(%rcx) + ret + + .p2align 4 +L(Fill11): + mov %rdx, (%rcx) + movl %edx, 7(%rcx) + ret + + .p2align 4 +L(Fill12): + mov %rdx, (%rcx) + movl %edx, 8(%rcx) + ret + + .p2align 4 +L(Fill13): + mov %rdx, (%rcx) + mov %rdx, 5(%rcx) + ret + + .p2align 4 +L(Fill14): + mov %rdx, (%rcx) + mov %rdx, 6(%rcx) + ret + + .p2align 4 +L(Fill15): + mov %rdx, (%rcx) + mov %rdx, 7(%rcx) + ret + + .p2align 4 +L(Fill16): + mov %rdx, (%rcx) + mov %rdx, 8(%rcx) + ret + + .p2align 4 +L(StrncpyFillExit1): + lea 16(%r8), %r8 +L(FillFrom1To16Bytes): + test %r8, %r8 + jz L(Fill0) + cmp $16, %r8 + je L(Fill16) + cmp $8, %r8 + je L(Fill8) + jg L(FillMore8) + cmp $4, %r8 + je L(Fill4) + jg L(FillMore4) + cmp $2, %r8 + jl L(Fill1) + je L(Fill2) + jg L(Fill3) +L(FillMore8): /* but less than 16 */ + cmp $12, %r8 + je L(Fill12) + jl L(FillLess12) + cmp $14, %r8 + jl L(Fill13) + je L(Fill14) + jg L(Fill15) +L(FillMore4): /* but less than 8 */ + cmp $6, %r8 + jl L(Fill5) + je L(Fill6) + jg L(Fill7) +L(FillLess12): /* but more than 8 */ + cmp $10, %r8 + jl L(Fill9) + je L(Fill10) + jmp L(Fill11) + + .p2align 4 +L(StrncpyFillTailWithZero1): + xor %rdx, %rdx + sub $16, %r8 + jbe L(StrncpyFillExit1) + + pxor %xmm0, %xmm0 + mov %rdx, (%rcx) + mov %rdx, 8(%rcx) + + lea 16(%rcx), %rcx + + mov %rcx, %rdx + and $0xf, %rdx + sub %rdx, %rcx + add %rdx, %r8 + xor %rdx, %rdx + sub $64, %r8 + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%rcx) + movdqa %xmm0, 16(%rcx) + movdqa %xmm0, 32(%rcx) + movdqa %xmm0, 48(%rcx) + lea 64(%rcx), %rcx + sub $64, %r8 + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %r8 + jl L(StrncpyFillLess32) + movdqa %xmm0, (%rcx) + movdqa %xmm0, 16(%rcx) + lea 32(%rcx), %rcx + sub $16, %r8 + jl L(StrncpyFillExit1) + movdqa %xmm0, (%rcx) + lea 16(%rcx), %rcx + jmp L(FillFrom1To16Bytes) + +L(StrncpyFillLess32): + add $16, %r8 + jl L(StrncpyFillExit1) + movdqa %xmm0, (%rcx) + lea 16(%rcx), %rcx + jmp L(FillFrom1To16Bytes) + + .p2align 4 +L(Exit0): + mov %rdx, %rax + ret + + .p2align 4 +L(StrncpyExit15Bytes): + cmp $9, %r8 + je L(Exit9) + cmpb $0, 8(%rcx) + jz L(Exit9) + cmp $10, %r8 + je L(Exit10) + cmpb $0, 9(%rcx) + jz L(Exit10) + cmp $11, %r8 + je L(Exit11) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmp $12, %r8 + je L(Exit12) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmp $13, %r8 + je L(Exit13) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmp $14, %r8 + je L(Exit14) + cmpb $0, 13(%rcx) + jz L(Exit14) + mov (%rcx), %rax + mov %rax, (%rdx) + mov 7(%rcx), %rax + mov %rax, 7(%rdx) +# ifdef USE_AS_STPCPY + lea 14(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax +# else + mov %rdi, %rax +# endif + ret + + .p2align 4 +L(StrncpyExit8Bytes): + cmp $1, %r8 + je L(Exit1) + cmpb $0, (%rcx) + jz L(Exit1) + cmp $2, %r8 + je L(Exit2) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmp $3, %r8 + je L(Exit3) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmp $4, %r8 + je L(Exit4) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmp $5, %r8 + je L(Exit5) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmp $6, %r8 + je L(Exit6) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmp $7, %r8 + je L(Exit7) + cmpb $0, 6(%rcx) + jz L(Exit7) + mov (%rcx), %rax + mov %rax, (%rdx) +# ifdef USE_AS_STPCPY + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax +# else + mov %rdi, %rax +# endif + ret + +# endif +# endif + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(StrncpyLeaveCase2OrCase3): + test %rax, %rax + jnz L(Aligned64LeaveCase2) + +L(Aligned64LeaveCase3): + lea 64(%r8), %r8 + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm4, -64(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm5, -48(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm6, -32(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + jmp L(CopyFrom1To16BytesCase3) + +L(Aligned64LeaveCase2): + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rax + add $48, %r8 + jle L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm4, -64(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm5, -48(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm6, -32(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + jmp L(CopyFrom1To16BytesCase2) +/*--------------------------------------------------*/ + .p2align 4 +L(StrncpyExit1Case2OrCase3): + movdqu -1(%rcx), %xmm0 + movdqu %xmm0, -1(%rdx) + mov $15, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit2Case2OrCase3): + movdqu -2(%rcx), %xmm0 + movdqu %xmm0, -2(%rdx) + mov $14, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit3Case2OrCase3): + movdqu -3(%rcx), %xmm0 + movdqu %xmm0, -3(%rdx) + mov $13, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit4Case2OrCase3): + movdqu -4(%rcx), %xmm0 + movdqu %xmm0, -4(%rdx) + mov $12, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit5Case2OrCase3): + movdqu -5(%rcx), %xmm0 + movdqu %xmm0, -5(%rdx) + mov $11, %rsi + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit6Case2OrCase3): + mov (%rcx), %rsi + mov 6(%rcx), %r9d + mov %r9d, 6(%rdx) + mov %rsi, (%rdx) + test %rax, %rax + mov $10, %rsi + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit7Case2OrCase3): + mov (%rcx), %rsi + mov 5(%rcx), %r9d + mov %r9d, 5(%rdx) + mov %rsi, (%rdx) + test %rax, %rax + mov $9, %rsi + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit8Case2OrCase3): + mov (%rcx), %r9 + mov $8, %rsi + mov %r9, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit9Case2OrCase3): + mov -1(%rcx), %r9 + mov $7, %rsi + mov %r9, -1(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit10Case2OrCase3): + mov -2(%rcx), %r9 + mov $6, %rsi + mov %r9, -2(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit11Case2OrCase3): + mov -3(%rcx), %r9 + mov $5, %rsi + mov %r9, -3(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit12Case2OrCase3): + mov (%rcx), %r9d + mov $4, %rsi + mov %r9d, (%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit13Case2OrCase3): + mov -1(%rcx), %r9d + mov $3, %rsi + mov %r9d, -1(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit14Case2OrCase3): + mov -2(%rcx), %r9d + mov $2, %rsi + mov %r9d, -2(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit15Case2OrCase3): + mov -3(%rcx), %r9d + mov $1, %rsi + mov %r9d, -3(%rdx) + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave1): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit1) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 31(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit1) + palignr $1, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit1) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit1) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit1): + lea 15(%rdx, %rsi), %rdx + lea 15(%rcx, %rsi), %rcx + mov -15(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -15(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave2): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit2) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 30(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit2) + palignr $2, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit2) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit2) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit2): + lea 14(%rdx, %rsi), %rdx + lea 14(%rcx, %rsi), %rcx + mov -14(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -14(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave3): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit3) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 29(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit3) + palignr $3, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit3) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit3) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit3): + lea 13(%rdx, %rsi), %rdx + lea 13(%rcx, %rsi), %rcx + mov -13(%rcx), %rsi + mov -8(%rcx), %rax + mov %rsi, -13(%rdx) + mov %rax, -8(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave4): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit4) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit4) + palignr $4, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit4) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit4) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit4): + lea 12(%rdx, %rsi), %rdx + lea 12(%rcx, %rsi), %rcx + mov -12(%rcx), %rsi + mov -4(%rcx), %eax + mov %rsi, -12(%rdx) + mov %eax, -4(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave5): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit5) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 27(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit5) + palignr $5, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit5) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit5) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit5): + lea 11(%rdx, %rsi), %rdx + lea 11(%rcx, %rsi), %rcx + mov -11(%rcx), %rsi + mov -4(%rcx), %eax + mov %rsi, -11(%rdx) + mov %eax, -4(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave6): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit6) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 26(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit6) + palignr $6, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit6) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit6) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit6): + lea 10(%rdx, %rsi), %rdx + lea 10(%rcx, %rsi), %rcx + mov -10(%rcx), %rsi + movw -2(%rcx), %ax + mov %rsi, -10(%rdx) + movw %ax, -2(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave7): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit7) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 25(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit7) + palignr $7, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit7) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit7) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit7): + lea 9(%rdx, %rsi), %rdx + lea 9(%rcx, %rsi), %rcx + mov -9(%rcx), %rsi + movb -1(%rcx), %ah + mov %rsi, -9(%rdx) + movb %ah, -1(%rdx) + xor %rsi, %rsi + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave8): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit8) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit8) + palignr $8, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit8) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit8) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit8): + lea 8(%rdx, %rsi), %rdx + lea 8(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave9): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit9) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 23(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit9) + palignr $9, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit9) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit9) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit9): + lea 7(%rdx, %rsi), %rdx + lea 7(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave10): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit10) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 22(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit10) + palignr $10, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit10) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit10) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit10): + lea 6(%rdx, %rsi), %rdx + lea 6(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave11): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit11) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 21(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit11) + palignr $11, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit11) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit11) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit11): + lea 5(%rdx, %rsi), %rdx + lea 5(%rcx, %rsi), %rcx + mov -8(%rcx), %rax + xor %rsi, %rsi + mov %rax, -8(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave12): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit12) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit12) + palignr $12, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit12) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit12) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit12): + lea 4(%rdx, %rsi), %rdx + lea 4(%rcx, %rsi), %rcx + mov -4(%rcx), %eax + xor %rsi, %rsi + mov %eax, -4(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave13): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit13) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 19(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit13) + palignr $13, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit13) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit13) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit13): + lea 3(%rdx, %rsi), %rdx + lea 3(%rcx, %rsi), %rcx + mov -4(%rcx), %eax + xor %rsi, %rsi + mov %eax, -4(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave14): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit14) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 18(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit14) + palignr $14, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit14) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit14) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit14): + lea 2(%rdx, %rsi), %rdx + lea 2(%rcx, %rsi), %rcx + movw -2(%rcx), %ax + xor %rsi, %rsi + movw %ax, -2(%rdx) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyLeave15): + movaps %xmm2, %xmm3 + add $48, %r8 + jle L(StrncpyExit15) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 17(%rcx), %xmm2 + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit15) + palignr $15, %xmm3, %xmm2 + movaps %xmm2, 16(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit15) + movaps %xmm4, 32(%rdx) + lea 16(%rsi), %rsi + sub $16, %r8 + jbe L(StrncpyExit15) + movaps %xmm5, 48(%rdx) + lea 16(%rsi), %rsi + lea -16(%r8), %r8 + +L(StrncpyExit15): + lea 1(%rdx, %rsi), %rdx + lea 1(%rcx, %rsi), %rcx + movb -1(%rcx), %ah + xor %rsi, %rsi + movb %ah, -1(%rdx) + jmp L(CopyFrom1To16BytesCase3) + +# endif +# ifndef USE_AS_STRCAT +END (STRCPY) +# endif +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy.S new file mode 100644 index 0000000000..77819ddc50 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcpy.S @@ -0,0 +1,99 @@ +/* Multiple versions of strcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY) +# ifndef STRCPY +# define STRCPY strcpy +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __stpncpy_ssse3 +# define STRCPY_SSE2 __stpncpy_sse2 +# define STRCPY_SSE2_UNALIGNED __stpncpy_sse2_unaligned +# define __GI_STRCPY __GI_stpncpy +# define __GI___STRCPY __GI___stpncpy +# else +# define STRCPY_SSSE3 __stpcpy_ssse3 +# define STRCPY_SSE2 __stpcpy_sse2 +# define STRCPY_SSE2_UNALIGNED __stpcpy_sse2_unaligned +# define __GI_STRCPY __GI_stpcpy +# define __GI___STRCPY __GI___stpcpy +# endif +#else +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __strncpy_ssse3 +# define STRCPY_SSE2 __strncpy_sse2 +# define STRCPY_SSE2_UNALIGNED __strncpy_sse2_unaligned +# define __GI_STRCPY __GI_strncpy +# else +# define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_SSE2 __strcpy_sse2 +# define STRCPY_SSE2_UNALIGNED __strcpy_sse2_unaligned +# define __GI_STRCPY __GI_strcpy +# endif +#endif + + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(STRCPY) + .type STRCPY, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq STRCPY_SSE2_UNALIGNED(%rip), %rax + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + leaq STRCPY_SSE2(%rip), %rax + HAS_CPU_FEATURE (SSSE3) + jz 2f + leaq STRCPY_SSSE3(%rip), %rax +2: ret +END(STRCPY) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCPY_SSE2, @function; \ + .align 16; \ + .globl STRCPY_SSE2; \ + .hidden STRCPY_SSE2; \ + STRCPY_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcpy calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2 +#endif + +#ifndef USE_AS_STRNCPY +#include "../strcpy.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcspn-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn-c.c new file mode 100644 index 0000000000..67991b5ca7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn-c.c @@ -0,0 +1,173 @@ +/* strcspn with SSE4.2 intrinsics + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <nmmintrin.h> +#include <string.h> +#include "varshift.h" + +/* We use 0x2: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_POSITIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any byte A and + the offset of the first byte. There are 3 cases: + + 1. The first 16byte data element has the byte A at the offset X. + 2. The first 16byte data element has EOS and doesn't have the byte A. + 3. The first 16byte data element is valid and doesn't have the byte A. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + 1 X 1 0/1 0 + 2 16 0 1 0 + 3 16 0 0 0 + + We exit from the loop for cases 1 and 2 with jbe which branches + when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset + X for case 1. */ + +#ifndef STRCSPN_SSE2 +# define STRCSPN_SSE2 __strcspn_sse2 +# define STRCSPN_SSE42 __strcspn_sse42 +#endif + +#ifdef USE_AS_STRPBRK +# define RETURN(val1, val2) return val1 +#else +# define RETURN(val1, val2) return val2 +#endif + +extern +#ifdef USE_AS_STRPBRK +char * +#else +size_t +#endif +STRCSPN_SSE2 (const char *, const char *); + + +#ifdef USE_AS_STRPBRK +char * +#else +size_t +#endif +__attribute__ ((section (".text.sse4.2"))) +STRCSPN_SSE42 (const char *s, const char *a) +{ + if (*a == 0) + RETURN (NULL, strlen (s)); + + const char *aligned; + __m128i mask; + int offset = (int) ((size_t) a & 15); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + + mask = __m128i_shift_right (mask0, offset); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. */ + __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return STRCSPN_SSE2 (s, a); + + if (index != 0) + { + /* Combine mask0 and mask1. We could play games with + palignr, but frankly this data should be in L1 now + so do the merge via an unaligned load. */ + mask = _mm_loadu_si128 ((__m128i *) a); + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return STRCSPN_SSE2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + value = __m128i_shift_right (value, offset); + + int length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. */ + int cflag = _mm_cmpistrc (mask, value, 0x2); + if (cflag) + RETURN ((char *) (s + length), length); + /* Find where the NULL terminator is. */ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x2); + int cflag = _mm_cmpistrc (mask, value, 0x2); + int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) + RETURN (NULL, + /* Find where the NULL terminator is. */ + (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); + aligned += 16; + } +} diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strcspn.S b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn.S new file mode 100644 index 0000000000..d102c7e80b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strcspn.S @@ -0,0 +1,69 @@ +/* Multiple versions of strcspn + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <init-arch.h> + +#ifdef USE_AS_STRPBRK +#define STRCSPN_SSE42 __strpbrk_sse42 +#define STRCSPN_SSE2 __strpbrk_sse2 +#define __GI_STRCSPN __GI_strpbrk +#else +#ifndef STRCSPN +#define STRCSPN strcspn +#define STRCSPN_SSE42 __strcspn_sse42 +#define STRCSPN_SSE2 __strcspn_sse2 +#define __GI_STRCSPN __GI_strcspn +#endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strpbrk in static library since we + need strpbrk before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc) + .text +ENTRY(STRCSPN) + .type STRCSPN, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq STRCSPN_SSE2(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jz 2f + leaq STRCSPN_SSE42(%rip), %rax +2: ret +END(STRCSPN) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCSPN_SSE2, @function; \ + .globl STRCSPN_SSE2; \ + .align 16; \ + STRCSPN_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 +#endif + +#ifdef USE_AS_STRPBRK +#include "../strpbrk.S" +#else +#include "../strcspn.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l-ssse3.S new file mode 100644 index 0000000000..6728678688 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l-ssse3.S @@ -0,0 +1,6 @@ +#define USE_SSSE3 1 +#define USE_AS_STRNCASECMP_L +#define NO_NOLOCALE_ALIAS +#define STRCMP __strncasecmp_l_ssse3 +#define __strncasecmp __strncasecmp_ssse3 +#include "../strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l.S new file mode 100644 index 0000000000..9c0149788e --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncase_l.S @@ -0,0 +1,8 @@ +/* Multiple versions of strncasecmp and strncasecmp_l + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP __strncasecmp_l +#define USE_AS_STRNCASECMP_L +#include "strcmp.S" + +weak_alias (__strncasecmp_l, strncasecmp_l) +libc_hidden_def (strncasecmp_l) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-c.c new file mode 100644 index 0000000000..a3cdbff689 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-c.c @@ -0,0 +1,8 @@ +#define STRNCAT __strncat_sse2 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__strncat_sse2, __GI___strncat, __strncat_sse2); +#endif + +#include "string/strncat.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S new file mode 100644 index 0000000000..133e1d20b0 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_sse2_unaligned +#include "strcat-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-ssse3.S new file mode 100644 index 0000000000..6c45ff3ec7 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_ssse3 +#include "strcat-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncat.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncat.S new file mode 100644 index 0000000000..5c1bf41453 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncat.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncat + All versions must be listed in ifunc-impl-list.c. */ +#define STRCAT strncat +#define USE_AS_STRNCAT +#include "strcat.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp-ssse3.S new file mode 100644 index 0000000000..96380a46be --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp-ssse3.S @@ -0,0 +1,6 @@ +#ifdef SHARED +# define USE_SSSE3 1 +# define STRCMP __strncmp_ssse3 +# define USE_AS_STRNCMP +# include "../strcmp.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp.S new file mode 100644 index 0000000000..fd5eb1397c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncmp.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncmp + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP strncmp +#define USE_AS_STRNCMP +#include "strcmp.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-c.c new file mode 100644 index 0000000000..296c32cb5d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-c.c @@ -0,0 +1,8 @@ +#define STRNCPY __strncpy_sse2 +#ifdef SHARED +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2); +#endif + +#include "strncpy.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S new file mode 100644 index 0000000000..fcc23a754a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-sse2-unaligned.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_sse2_unaligned +#include "strcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-ssse3.S new file mode 100644 index 0000000000..bf82ee447d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strncpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy.S new file mode 100644 index 0000000000..6d87a0ba35 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strncpy.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncpy + All versions must be listed in ifunc-impl-list.c. */ +#define STRCPY strncpy +#define USE_AS_STRNCPY +#include "strcpy.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk-c.c new file mode 100644 index 0000000000..bbf5c49d89 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk-c.c @@ -0,0 +1,8 @@ +/* Don't define multiple versions for strpbrk in static library since we + need strpbrk before the initialization happened. */ +#ifdef SHARED +# define USE_AS_STRPBRK +# define STRCSPN_SSE2 __strpbrk_sse2 +# define STRCSPN_SSE42 __strpbrk_sse42 +# include "strcspn-c.c" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk.S b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk.S new file mode 100644 index 0000000000..7201d6376f --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strpbrk.S @@ -0,0 +1,5 @@ +/* Multiple versions of strpbrk + All versions must be listed in ifunc-impl-list.c. */ +#define STRCSPN strpbrk +#define USE_AS_STRPBRK +#include "strcspn.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strspn-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/strspn-c.c new file mode 100644 index 0000000000..1704606b80 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strspn-c.c @@ -0,0 +1,145 @@ +/* strspn with SSE4.2 intrinsics + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <nmmintrin.h> +#include <string.h> +#include "varshift.h" + +/* We use 0x12: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any non-A byte and + the offset of the first byte. There are 2 cases: + + 1. The first 16byte data element has the non-A byte, including + EOS, at the offset X. + 2. The first 16byte data element is valid and doesn't have the non-A + byte. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + case ECX CFlag ZFlag SFlag + 1 X 1 0/1 0 + 2 16 0 0 0 + + We exit from the loop for case 1. */ + +extern size_t __strspn_sse2 (const char *, const char *); + + +size_t +__attribute__ ((section (".text.sse4.2"))) +__strspn_sse42 (const char *s, const char *a) +{ + if (*a == 0) + return 0; + + const char *aligned; + __m128i mask; + int offset = (int) ((size_t) a & 15); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + + mask = __m128i_shift_right (mask0, offset); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. */ + __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return __strspn_sse2 (s, a); + + if (index != 0) + { + /* Combine mask0 and mask1. We could play games with + palignr, but frankly this data should be in L1 now + so do the merge via an unaligned load. */ + mask = _mm_loadu_si128 ((__m128i *) a); + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return __strspn_sse2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + value = __m128i_shift_right (value, offset); + + int length = _mm_cmpistri (mask, value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. */ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + return length; + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x12); + int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; + } +} diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strspn.S b/REORG.TODO/sysdeps/x86_64/multiarch/strspn.S new file mode 100644 index 0000000000..adf7d9e533 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strspn.S @@ -0,0 +1,50 @@ +/* Multiple versions of strspn + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(strspn) + .type strspn, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + leaq __strspn_sse2(%rip), %rax + HAS_CPU_FEATURE (SSE4_2) + jz 2f + leaq __strspn_sse42(%rip), %rax +2: ret +END(strspn) + +# undef ENTRY +# define ENTRY(name) \ + .type __strspn_sse2, @function; \ + .globl __strspn_sse2; \ + .align 16; \ + __strspn_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strspn_sse2, .-__strspn_sse2 +#endif + +#include "../strspn.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/REORG.TODO/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S new file mode 100644 index 0000000000..138979d10a --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S @@ -0,0 +1,374 @@ +/* strstr with unaligned loads + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +ENTRY(__strstr_sse2_unaligned) + movzbl (%rsi), %eax + testb %al, %al + je L(empty) + movzbl 1(%rsi), %edx + testb %dl, %dl + je L(strchr) + movd %eax, %xmm1 + movd %edx, %xmm2 + movq %rdi, %rax + andl $4095, %eax + punpcklbw %xmm1, %xmm1 + cmpq $4031, %rax + punpcklbw %xmm2, %xmm2 + punpcklwd %xmm1, %xmm1 + punpcklwd %xmm2, %xmm2 + pshufd $0, %xmm1, %xmm1 + pshufd $0, %xmm2, %xmm2 + ja L(cross_page) + movdqu (%rdi), %xmm3 + pxor %xmm5, %xmm5 + movdqu 1(%rdi), %xmm4 + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + movdqu 16(%rdi), %xmm0 + pcmpeqb %xmm5, %xmm6 + pminub %xmm4, %xmm3 + movdqa %xmm3, %xmm4 + movdqu 17(%rdi), %xmm3 + pcmpeqb %xmm0, %xmm5 + pcmpeqb %xmm2, %xmm3 + por %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm0 + pminub %xmm3, %xmm0 + por %xmm5, %xmm0 + pmovmskb %xmm4, %r8d + pmovmskb %xmm0, %eax + salq $16, %rax + orq %rax, %r8 + je L(next_32_bytes) +L(next_pair_index): + bsf %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero1) + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found1) + cmpb 2(%rax), %dl + jne L(next_pair) + xorl %edx, %edx + jmp L(pair_loop_start) + + .p2align 4 +L(strchr): + movzbl %al, %esi + jmp __strchr_sse2 + + .p2align 4 +L(pair_loop): + addq $1, %rdx + cmpb 2(%rax,%rdx), %cl + jne L(next_pair) +L(pair_loop_start): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop) +L(found1): + ret +L(zero1): + xorl %eax, %eax + ret + + .p2align 4 +L(next_pair): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index) + + .p2align 4 +L(next_32_bytes): + movdqu 32(%rdi), %xmm3 + pxor %xmm5, %xmm5 + movdqu 33(%rdi), %xmm4 + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + movdqu 48(%rdi), %xmm0 + pcmpeqb %xmm5, %xmm6 + pminub %xmm4, %xmm3 + movdqa %xmm3, %xmm4 + movdqu 49(%rdi), %xmm3 + pcmpeqb %xmm0, %xmm5 + pcmpeqb %xmm2, %xmm3 + por %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm0 + pminub %xmm3, %xmm0 + por %xmm5, %xmm0 + pmovmskb %xmm4, %eax + salq $32, %rax + pmovmskb %xmm0, %r8d + salq $48, %r8 + orq %rax, %r8 + je L(loop_header) +L(next_pair2_index): + bsfq %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero2) + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found2) + cmpb 2(%rax), %dl + jne L(next_pair2) + xorl %edx, %edx + jmp L(pair_loop2_start) + + .p2align 4 +L(pair_loop2): + addq $1, %rdx + cmpb 2(%rax,%rdx), %cl + jne L(next_pair2) +L(pair_loop2_start): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop2) +L(found2): + ret + L(zero2): + xorl %eax, %eax + ret +L(empty): + mov %rdi, %rax + ret + + .p2align 4 +L(next_pair2): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair2_index) +L(loop_header): + movq $-512, %r11 + movq %rdi, %r9 + + pxor %xmm7, %xmm7 + andq $-64, %rdi + + .p2align 4 +L(loop): + movdqa 64(%rdi), %xmm3 + movdqu 63(%rdi), %xmm6 + movdqa %xmm3, %xmm0 + pxor %xmm2, %xmm3 + pxor %xmm1, %xmm6 + movdqa 80(%rdi), %xmm10 + por %xmm3, %xmm6 + pminub %xmm10, %xmm0 + movdqu 79(%rdi), %xmm3 + pxor %xmm2, %xmm10 + pxor %xmm1, %xmm3 + movdqa 96(%rdi), %xmm9 + por %xmm10, %xmm3 + pminub %xmm9, %xmm0 + pxor %xmm2, %xmm9 + movdqa 112(%rdi), %xmm8 + addq $64, %rdi + pminub %xmm6, %xmm3 + movdqu 31(%rdi), %xmm4 + pminub %xmm8, %xmm0 + pxor %xmm2, %xmm8 + pxor %xmm1, %xmm4 + por %xmm9, %xmm4 + pminub %xmm4, %xmm3 + movdqu 47(%rdi), %xmm5 + pxor %xmm1, %xmm5 + por %xmm8, %xmm5 + pminub %xmm5, %xmm3 + pminub %xmm3, %xmm0 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + testl %eax, %eax + je L(loop) + pminub (%rdi), %xmm6 + pminub 32(%rdi),%xmm4 + pminub 48(%rdi),%xmm5 + pcmpeqb %xmm7, %xmm6 + pcmpeqb %xmm7, %xmm5 + pmovmskb %xmm6, %edx + movdqa 16(%rdi), %xmm8 + pcmpeqb %xmm7, %xmm4 + movdqu 15(%rdi), %xmm0 + pmovmskb %xmm5, %r8d + movdqa %xmm8, %xmm3 + pmovmskb %xmm4, %ecx + pcmpeqb %xmm1,%xmm0 + pcmpeqb %xmm2,%xmm3 + salq $32, %rcx + pcmpeqb %xmm7,%xmm8 + salq $48, %r8 + pminub %xmm0,%xmm3 + orq %rcx, %rdx + por %xmm3,%xmm8 + orq %rdx, %r8 + pmovmskb %xmm8, %eax + salq $16, %rax + orq %rax, %r8 + je L(loop) +L(next_pair_index3): + bsfq %r8, %rcx + addq %rdi, %rcx + cmpb $0, (%rcx) + je L(zero) + xorl %eax, %eax + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(success3) + cmpb 1(%rcx), %dl + jne L(next_pair3) + jmp L(pair_loop_start3) + + .p2align 4 +L(pair_loop3): + addq $1, %rax + cmpb 1(%rcx,%rax), %dl + jne L(next_pair3) +L(pair_loop_start3): + movzbl 3(%rsi,%rax), %edx + testb %dl, %dl + jne L(pair_loop3) +L(success3): + lea -1(%rcx), %rax + ret + + .p2align 4 +L(next_pair3): + addq %rax, %r11 + movq %rdi, %rax + subq %r9, %rax + cmpq %r11, %rax + jl L(switch_strstr) + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index3) + jmp L(loop) + + .p2align 4 +L(switch_strstr): + movq %rdi, %rdi + jmp __strstr_sse2 + + .p2align 4 +L(cross_page): + + movq %rdi, %rax + pxor %xmm0, %xmm0 + andq $-64, %rax + movdqa (%rax), %xmm3 + movdqu -1(%rax), %xmm4 + movdqa %xmm3, %xmm8 + movdqa 16(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm0, %xmm8 + pcmpeqb %xmm2, %xmm3 + movdqa %xmm5, %xmm7 + pminub %xmm4, %xmm3 + movdqu 15(%rax), %xmm4 + pcmpeqb %xmm0, %xmm7 + por %xmm3, %xmm8 + movdqa %xmm5, %xmm3 + movdqa 32(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm3 + movdqa %xmm5, %xmm6 + pmovmskb %xmm8, %ecx + pminub %xmm4, %xmm3 + movdqu 31(%rax), %xmm4 + por %xmm3, %xmm7 + movdqa %xmm5, %xmm3 + pcmpeqb %xmm0, %xmm6 + movdqa 48(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm7, %r8d + pcmpeqb %xmm2, %xmm3 + pcmpeqb %xmm5, %xmm0 + pminub %xmm4, %xmm3 + movdqu 47(%rax), %xmm4 + por %xmm3, %xmm6 + movdqa %xmm5, %xmm3 + salq $16, %r8 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm3 + pmovmskb %xmm6, %r10d + pminub %xmm4, %xmm3 + por %xmm3, %xmm0 + salq $32, %r10 + orq %r10, %r8 + orq %rcx, %r8 + movl %edi, %ecx + pmovmskb %xmm0, %edx + subl %eax, %ecx + salq $48, %rdx + orq %rdx, %r8 + shrq %cl, %r8 + je L(loop_header) +L(next_pair_index4): + bsfq %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero) + + cmpq %rax,%rdi + je L(next_pair4) + + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found3) + cmpb 1(%rax), %dl + jne L(next_pair4) + xorl %edx, %edx + jmp L(pair_loop_start4) + + .p2align 4 +L(pair_loop4): + addq $1, %rdx + cmpb 1(%rax,%rdx), %cl + jne L(next_pair4) +L(pair_loop_start4): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop4) +L(found3): + subq $1, %rax + ret + + .p2align 4 +L(next_pair4): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index4) + jmp L(loop_header) + + .p2align 4 +L(found): + rep + ret + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + +END(__strstr_sse2_unaligned) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/strstr.c b/REORG.TODO/sysdeps/x86_64/multiarch/strstr.c new file mode 100644 index 0000000000..a7d181d797 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/strstr.c @@ -0,0 +1,50 @@ +/* Multiple versions of strstr. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Redefine strstr so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +#undef strstr +#define strstr __redirect_strstr +#include <string.h> +#undef strstr + +#define STRSTR __strstr_sse2 +#ifdef SHARED +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2); +#endif + +#include "string/strstr.c" + +extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden; +extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; + +#include "init-arch.h" + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ +extern __typeof (__redirect_strstr) __libc_strstr; +libc_ifunc (__libc_strstr, + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + ? __strstr_sse2_unaligned + : __strstr_sse2) + +#undef strstr +strong_alias (__libc_strstr, strstr) diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/test-multiarch.c b/REORG.TODO/sysdeps/x86_64/multiarch/test-multiarch.c new file mode 100644 index 0000000000..597d64e1e8 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/test-multiarch.c @@ -0,0 +1,96 @@ +/* Test CPU feature data. + This file is part of the GNU C Library. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <cpu-features.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +static char *cpu_flags; + +/* Search for flags in /proc/cpuinfo and store line + in cpu_flags. */ +void +get_cpuinfo (void) +{ + FILE *f; + char *line = NULL; + size_t len = 0; + ssize_t read; + + f = fopen ("/proc/cpuinfo", "r"); + if (f == NULL) + { + printf ("cannot open /proc/cpuinfo\n"); + exit (1); + } + + while ((read = getline (&line, &len, f)) != -1) + { + if (strncmp (line, "flags", 5) == 0) + { + cpu_flags = strdup (line); + break; + } + } + fclose (f); + free (line); +} + +int +check_proc (const char *proc_name, int flag, const char *name) +{ + int found = 0; + + printf ("Checking %s:\n", name); + printf (" init-arch %d\n", flag); + if (strstr (cpu_flags, proc_name) != NULL) + found = 1; + printf (" cpuinfo (%s) %d\n", proc_name, found); + + if (found != flag) + printf (" *** failure ***\n"); + + return (found != flag); +} + +static int +do_test (int argc, char **argv) +{ + int fails; + + get_cpuinfo (); + fails = check_proc ("avx", HAS_ARCH_FEATURE (AVX_Usable), + "HAS_ARCH_FEATURE (AVX_Usable)"); + fails += check_proc ("fma4", HAS_ARCH_FEATURE (FMA4_Usable), + "HAS_ARCH_FEATURE (FMA4_Usable)"); + fails += check_proc ("sse4_2", HAS_CPU_FEATURE (SSE4_2), + "HAS_CPU_FEATURE (SSE4_2)"); + fails += check_proc ("sse4_1", HAS_CPU_FEATURE (SSE4_1) + , "HAS_CPU_FEATURE (SSE4_1)"); + fails += check_proc ("ssse3", HAS_CPU_FEATURE (SSSE3), + "HAS_CPU_FEATURE (SSSE3)"); + fails += check_proc ("popcnt", HAS_CPU_FEATURE (POPCOUNT), + "HAS_CPU_FEATURE (POPCOUNT)"); + + printf ("%d differences between /proc/cpuinfo and glibc code.\n", fails); + + return (fails != 0); +} + +#include "../../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/varshift.c b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.c new file mode 100644 index 0000000000..1c3e34845d --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.c @@ -0,0 +1,25 @@ +/* Helper for variable shifts of SSE registers. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "varshift.h" + +const int8_t ___m128i_shift_right[31] attribute_hidden = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/varshift.h b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.h new file mode 100644 index 0000000000..07bb76c4bf --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/varshift.h @@ -0,0 +1,30 @@ +/* Helper for variable shifts of SSE registers. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdint.h> +#include <tmmintrin.h> + +extern const int8_t ___m128i_shift_right[31] attribute_hidden; + +static __inline__ __m128i +__m128i_shift_right (__m128i value, unsigned long int offset) +{ + return _mm_shuffle_epi8 (value, + _mm_loadu_si128 ((__m128i *) (___m128i_shift_right + + offset))); +} diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-c.c new file mode 100644 index 0000000000..a51a83a9be --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-c.c @@ -0,0 +1,5 @@ +#if IS_IN (libc) +# define wcscpy __wcscpy_sse2 +#endif + +#include "wcsmbs/wcscpy.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-ssse3.S new file mode 100644 index 0000000000..53857ce4f5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy-ssse3.S @@ -0,0 +1,552 @@ +/* wcscpy with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# include <sysdep.h> + + .section .text.ssse3,"ax",@progbits +ENTRY (__wcscpy_ssse3) + + mov %rsi, %rcx + mov %rdi, %rdx + + cmpl $0, (%rcx) + jz L(Exit4) + cmpl $0, 4(%rcx) + jz L(Exit8) + cmpl $0, 8(%rcx) + jz L(Exit12) + cmpl $0, 12(%rcx) + jz L(Exit16) + + lea 16(%rcx), %rsi + and $-16, %rsi + + pxor %xmm0, %xmm0 + mov (%rcx), %r9 + mov %r9, (%rdx) + + pcmpeqd (%rsi), %xmm0 + mov 8(%rcx), %r9 + mov %r9, 8(%rdx) + + pmovmskb %xmm0, %rax + sub %rcx, %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + mov %rdx, %rax + lea 16(%rdx), %rdx + and $-16, %rdx + sub %rdx, %rax + sub %rax, %rcx + mov %rcx, %rax + and $0xf, %rax + mov $0, %rsi + +/* case: rcx_offset == rdx_offset */ + + jz L(Align16Both) + + cmp $4, %rax + je L(Shl4) + cmp $8, %rax + je L(Shl8) + jmp L(Shl12) + +L(Align16Both): + movaps (%rcx), %xmm1 + movaps 16(%rcx), %xmm2 + movaps %xmm1, (%rdx) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm4 + movaps %xmm3, (%rdx, %rsi) + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm1 + movaps %xmm4, (%rdx, %rsi) + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm2 + movaps %xmm1, (%rdx, %rsi) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%rcx, %rsi), %xmm3 + movaps %xmm2, (%rdx, %rsi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%rdx, %rsi) + mov %rcx, %rax + lea 16(%rcx, %rsi), %rcx + and $-0x40, %rcx + sub %rcx, %rax + sub %rax, %rdx + + mov $-0x40, %rsi + + .p2align 4 +L(Aligned64Loop): + movaps (%rcx), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rcx), %xmm5 + movaps 32(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rcx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqd %xmm0, %xmm3 + pmovmskb %xmm3, %rax + lea 64(%rdx), %rdx + lea 64(%rcx), %rcx + test %rax, %rax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%rdx) + movaps %xmm5, -48(%rdx) + movaps %xmm6, -32(%rdx) + movaps %xmm7, -16(%rdx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %rax + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm5, %xmm0 + + pmovmskb %xmm0, %rax + movaps %xmm4, -64(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm6, %xmm0 + + pmovmskb %xmm0, %rax + movaps %xmm5, -48(%rdx) + test %rax, %rax + lea 16(%rsi), %rsi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%rdx) + pcmpeqd %xmm7, %xmm0 + + pmovmskb %xmm0, %rax + lea 16(%rsi), %rsi + test %rax, %rax + jnz L(CopyFrom1To16Bytes) + + mov $-0x40, %rsi + movaps %xmm7, -16(%rdx) + jmp L(Aligned64Loop) + + .p2align 4 +L(Shl4): + movaps -4(%rcx), %xmm1 + movaps 12(%rcx), %xmm2 +L(Shl4Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 + + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 28(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + + test %rax, %rax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 28(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -12(%rcx), %rcx + sub %rax, %rdx + + movaps -4(%rcx), %xmm1 + + .p2align 4 +L(Shl4LoopStart): + movaps 12(%rcx), %xmm2 + movaps 28(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %rax, %rax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) + + palignr $4, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movdqu -4(%rcx), %xmm1 + mov $12, %rsi + movdqu %xmm1, -4(%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl8): + movaps -8(%rcx), %xmm1 + movaps 8(%rcx), %xmm2 +L(Shl8Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 + + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 24(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + + test %rax, %rax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 24(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -8(%rcx), %rcx + sub %rax, %rdx + + movaps -8(%rcx), %xmm1 + + .p2align 4 +L(Shl8LoopStart): + movaps 8(%rcx), %xmm2 + movaps 24(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %rax, %rax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) + + palignr $8, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + mov (%rcx), %r9 + mov $8, %rsi + mov %r9, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl12): + movaps -12(%rcx), %xmm1 + movaps 4(%rcx), %xmm2 +L(Shl12Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %rax + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm1 + + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + movaps %xmm2, %xmm3 + + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%rdx) + movaps 20(%rcx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%rdx), %rdx + pmovmskb %xmm0, %rax + lea 16(%rcx), %rcx + + test %rax, %rax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%rdx) + lea 20(%rcx), %rcx + lea 16(%rdx), %rdx + + mov %rcx, %rax + and $-0x40, %rcx + sub %rcx, %rax + lea -4(%rcx), %rcx + sub %rax, %rdx + + movaps -12(%rcx), %xmm1 + + .p2align 4 +L(Shl12LoopStart): + movaps 4(%rcx), %xmm2 + movaps 20(%rcx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%rcx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%rcx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %rax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %rax, %rax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) + palignr $12, %xmm2, %xmm3 + lea 64(%rcx), %rcx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%rdx) + movaps %xmm4, 32(%rdx) + movaps %xmm3, 16(%rdx) + movaps %xmm2, (%rdx) + lea 64(%rdx), %rdx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + mov (%rcx), %r9d + mov $4, %rsi + mov %r9d, (%rdx) + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(CopyFrom1To16Bytes): + add %rsi, %rdx + add %rsi, %rcx + + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) + + mov (%rcx), %rax + mov %rax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit12) + + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(Exit4): + movl (%rcx), %eax + movl %eax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(Exit8): + mov (%rcx), %rax + mov %rax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(Exit12): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %eax + mov %eax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(Exit16): + mov (%rcx), %rax + mov %rax, (%rdx) + mov 8(%rcx), %rax + mov %rax, 8(%rdx) + mov %rdi, %rax + ret + +END(__wcscpy_ssse3) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy.S b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy.S new file mode 100644 index 0000000000..9150ab6d18 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcscpy.S @@ -0,0 +1,40 @@ +/* Multiple versions of wcscpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + + .text +ENTRY(wcscpy) + .type wcscpy, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_CPU_FEATURE (SSSE3) + jnz 2f + leaq __wcscpy_sse2(%rip), %rax + ret + +2: leaq __wcscpy_ssse3(%rip), %rax + ret + +END(wcscpy) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-c.c new file mode 100644 index 0000000000..e1ec7cfbb5 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-c.c @@ -0,0 +1,9 @@ +#if IS_IN (libc) +# include <wchar.h> + +# define WCSNLEN __wcsnlen_sse2 + +extern __typeof (wcsnlen) __wcsnlen_sse2; +#endif + +#include "wcsmbs/wcsnlen.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S new file mode 100644 index 0000000000..a8cab0cb00 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S @@ -0,0 +1,5 @@ +#define AS_WCSLEN +#define AS_STRNLEN +#define strlen __wcsnlen_sse4_1 + +#include "../strlen.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen.c b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen.c new file mode 100644 index 0000000000..304f62eec3 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wcsnlen.c @@ -0,0 +1,45 @@ +/* Multiple versions of wcsnlen. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define __wcsnlen __redirect_wcsnlen +# include <wchar.h> +# undef __wcsnlen + +# define SYMBOL_NAME wcsnlen +# include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); + + return OPTIMIZE (sse2); +} + +libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ()); +weak_alias (__wcsnlen, wcsnlen); +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S new file mode 100644 index 0000000000..bfa1a16a35 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_avx2_movbe +#define USE_AS_WMEMCMP 1 + +#include "memcmp-avx2-movbe.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-c.c b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-c.c new file mode 100644 index 0000000000..46b6715e18 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-c.c @@ -0,0 +1,9 @@ +#if IS_IN (libc) +# include <wchar.h> + +# define WMEMCMP __wmemcmp_sse2 + +extern __typeof (wmemcmp) __wmemcmp_sse2; +#endif + +#include "wcsmbs/wmemcmp.c" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-sse4.S new file mode 100644 index 0000000000..b07973a4f6 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-sse4.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse4_1 + +#include "memcmp-sse4.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S new file mode 100644 index 0000000000..a41ef95fc1 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_ssse3 + +#include "memcmp-ssse3.S" diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp.S new file mode 100644 index 0000000000..94b25a214c --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemcmp.S @@ -0,0 +1,55 @@ +/* Multiple versions of wmemcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(wmemcmp) + .type wmemcmp, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 1f + HAS_ARCH_FEATURE (AVX2_Usable) + jz 1f + HAS_CPU_FEATURE (MOVBE) + jz 1f + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz 1f + leaq __wmemcmp_avx2_movbe(%rip), %rax + ret + +1: HAS_CPU_FEATURE (SSSE3) + jnz 2f + leaq __wmemcmp_sse2(%rip), %rax + ret + +2: HAS_CPU_FEATURE (SSE4_1) + jz 3f + leaq __wmemcmp_sse4_1(%rip), %rax + ret + +3: leaq __wmemcmp_ssse3(%rip), %rax + ret + +END(wmemcmp) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemset.c b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset.c new file mode 100644 index 0000000000..dd35be6e49 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset.c @@ -0,0 +1,33 @@ +/* Multiple versions of wmemset. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define wmemset __redirect_wmemset +# define __wmemset __redirect___wmemset +# include <wchar.h> +# undef wmemset +# undef __wmemset + +# define SYMBOL_NAME wmemset +# include "ifunc-wmemset.h" + +libc_ifunc_redirected (__redirect_wmemset, __wmemset, IFUNC_SELECTOR ()); +weak_alias (__wmemset, wmemset) +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S new file mode 100644 index 0000000000..0a537fe272 --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk-nonshared.S @@ -0,0 +1,21 @@ +/* Non-shared version of wmemset_chk for x86-64. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) && !defined SHARED +# include "../wmemset_chk.S" +#endif diff --git a/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk.c b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk.c new file mode 100644 index 0000000000..d3ded5595b --- /dev/null +++ b/REORG.TODO/sysdeps/x86_64/multiarch/wmemset_chk.c @@ -0,0 +1,31 @@ +/* Multiple versions of wmemset_chk. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc.so. */ +#if IS_IN (libc) && defined SHARED +# define __wmemset_chk __redirect_wmemset_chk +# include <wchar.h> +# undef __wmemset_chk + +# define SYMBOL_NAME wmemset_chk +# include "ifunc-wmemset.h" + +libc_ifunc_redirected (__redirect_wmemset_chk, __wmemset_chk, + IFUNC_SELECTOR ()); +#endif |