From 21596c745d841e6b3384f1af93cfe1df4afc6648 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 20 Aug 2015 06:28:49 -0700 Subject: Add i386 memcpy family multiarch functions --- sysdeps/i386/i586/multiarch/memcpy.c | 1 + sysdeps/i386/i586/multiarch/mempcpy.c | 1 + sysdeps/i386/i586/multiarch/rtld-memcpy.S | 19 + sysdeps/i386/i586/multiarch/static-memcpy.S | 21 + sysdeps/i386/i586/multiarch/static-mempcpy.S | 21 + sysdeps/i386/i686/multiarch/Makefile | 9 +- sysdeps/i386/i686/multiarch/bcopy-i386.S | 1 + sysdeps/i386/i686/multiarch/bcopy-i686.S | 7 + sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S | 4 - sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S | 4 - sysdeps/i386/i686/multiarch/bcopy-ssse3.S | 4 - sysdeps/i386/i686/multiarch/bcopy.S | 59 - sysdeps/i386/i686/multiarch/bcopy.c | 1 + sysdeps/i386/i686/multiarch/memcpy-i386.S | 1 + sysdeps/i386/i686/multiarch/memcpy-i586.S | 1 + sysdeps/i386/i686/multiarch/memcpy-i686.S | 7 + .../i386/i686/multiarch/memcpy-sse2-unaligned.S | 681 ----- sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S | 1809 ----------- sysdeps/i386/i686/multiarch/memcpy-ssse3.S | 3162 -------------------- sysdeps/i386/i686/multiarch/memcpy.S | 78 - sysdeps/i386/i686/multiarch/memcpy.c | 1 + sysdeps/i386/i686/multiarch/memcpy_chk.S | 50 - sysdeps/i386/i686/multiarch/memmove-i386.S | 1 + sysdeps/i386/i686/multiarch/memmove-i686.S | 7 + .../i386/i686/multiarch/memmove-sse2-unaligned.S | 4 - sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S | 4 - sysdeps/i386/i686/multiarch/memmove-ssse3.S | 4 - sysdeps/i386/i686/multiarch/memmove.S | 89 - sysdeps/i386/i686/multiarch/memmove.c | 1 + sysdeps/i386/i686/multiarch/memmove_chk.S | 94 - sysdeps/i386/i686/multiarch/mempcpy-i386.S | 1 + sysdeps/i386/i686/multiarch/mempcpy-i586.S | 1 + sysdeps/i386/i686/multiarch/mempcpy-i686.S | 10 + .../i386/i686/multiarch/mempcpy-sse2-unaligned.S | 4 - sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S | 4 - sysdeps/i386/i686/multiarch/mempcpy-ssse3.S | 4 - 
sysdeps/i386/i686/multiarch/mempcpy.S | 81 - sysdeps/i386/i686/multiarch/mempcpy.c | 1 + sysdeps/i386/i686/multiarch/mempcpy_chk.S | 50 - sysdeps/i386/i686/multiarch/rtld-memcpy.S | 19 + sysdeps/i386/i686/multiarch/rtld-memmove.S | 19 + sysdeps/i386/i686/multiarch/static-memcpy.S | 21 + sysdeps/i386/i686/multiarch/static-memmove.S | 21 + sysdeps/i386/i686/multiarch/static-mempcpy.S | 21 + sysdeps/i386/multiarch/Makefile | 11 +- sysdeps/i386/multiarch/bcopy-i386.S | 12 + sysdeps/i386/multiarch/bcopy-i686.S | 6 + sysdeps/i386/multiarch/bcopy-sse2-unaligned.S | 4 + sysdeps/i386/multiarch/bcopy-ssse3-rep.S | 4 + sysdeps/i386/multiarch/bcopy-ssse3.S | 4 + sysdeps/i386/multiarch/bcopy.c | 64 + sysdeps/i386/multiarch/ifunc-impl-list.c | 55 +- sysdeps/i386/multiarch/memcpy-i386.S | 11 + sysdeps/i386/multiarch/memcpy-i586.S | 7 + sysdeps/i386/multiarch/memcpy-i686.S | 7 + sysdeps/i386/multiarch/memcpy-sse2-unaligned.S | 681 +++++ sysdeps/i386/multiarch/memcpy-ssse3-rep.S | 1809 +++++++++++ sysdeps/i386/multiarch/memcpy-ssse3.S | 3162 ++++++++++++++++++++ sysdeps/i386/multiarch/memcpy.c | 69 + sysdeps/i386/multiarch/memcpy_chk.c | 70 + sysdeps/i386/multiarch/memmove-i386.S | 11 + sysdeps/i386/multiarch/memmove-i686.S | 7 + sysdeps/i386/multiarch/memmove-sse2-unaligned.S | 4 + sysdeps/i386/multiarch/memmove-ssse3-rep.S | 4 + sysdeps/i386/multiarch/memmove-ssse3.S | 4 + sysdeps/i386/multiarch/memmove.c | 66 + sysdeps/i386/multiarch/memmove_chk.c | 105 + sysdeps/i386/multiarch/mempcpy-i386.S | 18 + sysdeps/i386/multiarch/mempcpy-i586.S | 11 + sysdeps/i386/multiarch/mempcpy-i686.S | 11 + sysdeps/i386/multiarch/mempcpy-sse2-unaligned.S | 4 + sysdeps/i386/multiarch/mempcpy-ssse3-rep.S | 4 + sysdeps/i386/multiarch/mempcpy-ssse3.S | 4 + sysdeps/i386/multiarch/mempcpy.c | 71 + sysdeps/i386/multiarch/mempcpy_chk.c | 70 + sysdeps/i386/multiarch/rtld-memmove.S | 19 + sysdeps/i386/multiarch/static-memcpy.S | 21 + sysdeps/i386/multiarch/static-memmove.S | 21 + 
sysdeps/i386/multiarch/static-mempcpy.S | 21 + 79 files changed, 6650 insertions(+), 6205 deletions(-) create mode 100644 sysdeps/i386/i586/multiarch/memcpy.c create mode 100644 sysdeps/i386/i586/multiarch/mempcpy.c create mode 100644 sysdeps/i386/i586/multiarch/rtld-memcpy.S create mode 100644 sysdeps/i386/i586/multiarch/static-memcpy.S create mode 100644 sysdeps/i386/i586/multiarch/static-mempcpy.S create mode 100644 sysdeps/i386/i686/multiarch/bcopy-i386.S create mode 100644 sysdeps/i386/i686/multiarch/bcopy-i686.S delete mode 100644 sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S delete mode 100644 sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S delete mode 100644 sysdeps/i386/i686/multiarch/bcopy-ssse3.S delete mode 100644 sysdeps/i386/i686/multiarch/bcopy.S create mode 100644 sysdeps/i386/i686/multiarch/bcopy.c create mode 100644 sysdeps/i386/i686/multiarch/memcpy-i386.S create mode 100644 sysdeps/i386/i686/multiarch/memcpy-i586.S create mode 100644 sysdeps/i386/i686/multiarch/memcpy-i686.S delete mode 100644 sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S delete mode 100644 sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S delete mode 100644 sysdeps/i386/i686/multiarch/memcpy-ssse3.S delete mode 100644 sysdeps/i386/i686/multiarch/memcpy.S create mode 100644 sysdeps/i386/i686/multiarch/memcpy.c delete mode 100644 sysdeps/i386/i686/multiarch/memcpy_chk.S create mode 100644 sysdeps/i386/i686/multiarch/memmove-i386.S create mode 100644 sysdeps/i386/i686/multiarch/memmove-i686.S delete mode 100644 sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S delete mode 100644 sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S delete mode 100644 sysdeps/i386/i686/multiarch/memmove-ssse3.S delete mode 100644 sysdeps/i386/i686/multiarch/memmove.S create mode 100644 sysdeps/i386/i686/multiarch/memmove.c delete mode 100644 sysdeps/i386/i686/multiarch/memmove_chk.S create mode 100644 sysdeps/i386/i686/multiarch/mempcpy-i386.S create mode 100644 
sysdeps/i386/i686/multiarch/mempcpy-i586.S create mode 100644 sysdeps/i386/i686/multiarch/mempcpy-i686.S delete mode 100644 sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S delete mode 100644 sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S delete mode 100644 sysdeps/i386/i686/multiarch/mempcpy-ssse3.S delete mode 100644 sysdeps/i386/i686/multiarch/mempcpy.S create mode 100644 sysdeps/i386/i686/multiarch/mempcpy.c delete mode 100644 sysdeps/i386/i686/multiarch/mempcpy_chk.S create mode 100644 sysdeps/i386/i686/multiarch/rtld-memcpy.S create mode 100644 sysdeps/i386/i686/multiarch/rtld-memmove.S create mode 100644 sysdeps/i386/i686/multiarch/static-memcpy.S create mode 100644 sysdeps/i386/i686/multiarch/static-memmove.S create mode 100644 sysdeps/i386/i686/multiarch/static-mempcpy.S create mode 100644 sysdeps/i386/multiarch/bcopy-i386.S create mode 100644 sysdeps/i386/multiarch/bcopy-i686.S create mode 100644 sysdeps/i386/multiarch/bcopy-sse2-unaligned.S create mode 100644 sysdeps/i386/multiarch/bcopy-ssse3-rep.S create mode 100644 sysdeps/i386/multiarch/bcopy-ssse3.S create mode 100644 sysdeps/i386/multiarch/bcopy.c create mode 100644 sysdeps/i386/multiarch/memcpy-i386.S create mode 100644 sysdeps/i386/multiarch/memcpy-i586.S create mode 100644 sysdeps/i386/multiarch/memcpy-i686.S create mode 100644 sysdeps/i386/multiarch/memcpy-sse2-unaligned.S create mode 100644 sysdeps/i386/multiarch/memcpy-ssse3-rep.S create mode 100644 sysdeps/i386/multiarch/memcpy-ssse3.S create mode 100644 sysdeps/i386/multiarch/memcpy.c create mode 100644 sysdeps/i386/multiarch/memcpy_chk.c create mode 100644 sysdeps/i386/multiarch/memmove-i386.S create mode 100644 sysdeps/i386/multiarch/memmove-i686.S create mode 100644 sysdeps/i386/multiarch/memmove-sse2-unaligned.S create mode 100644 sysdeps/i386/multiarch/memmove-ssse3-rep.S create mode 100644 sysdeps/i386/multiarch/memmove-ssse3.S create mode 100644 sysdeps/i386/multiarch/memmove.c create mode 100644 
sysdeps/i386/multiarch/memmove_chk.c create mode 100644 sysdeps/i386/multiarch/mempcpy-i386.S create mode 100644 sysdeps/i386/multiarch/mempcpy-i586.S create mode 100644 sysdeps/i386/multiarch/mempcpy-i686.S create mode 100644 sysdeps/i386/multiarch/mempcpy-sse2-unaligned.S create mode 100644 sysdeps/i386/multiarch/mempcpy-ssse3-rep.S create mode 100644 sysdeps/i386/multiarch/mempcpy-ssse3.S create mode 100644 sysdeps/i386/multiarch/mempcpy.c create mode 100644 sysdeps/i386/multiarch/mempcpy_chk.c create mode 100644 sysdeps/i386/multiarch/rtld-memmove.S create mode 100644 sysdeps/i386/multiarch/static-memcpy.S create mode 100644 sysdeps/i386/multiarch/static-memmove.S create mode 100644 sysdeps/i386/multiarch/static-mempcpy.S diff --git a/sysdeps/i386/i586/multiarch/memcpy.c b/sysdeps/i386/i586/multiarch/memcpy.c new file mode 100644 index 0000000000..a23ae5c20c --- /dev/null +++ b/sysdeps/i386/i586/multiarch/memcpy.c @@ -0,0 +1 @@ +#include diff --git a/sysdeps/i386/i586/multiarch/mempcpy.c b/sysdeps/i386/i586/multiarch/mempcpy.c new file mode 100644 index 0000000000..1ae8773514 --- /dev/null +++ b/sysdeps/i386/i586/multiarch/mempcpy.c @@ -0,0 +1 @@ +#include diff --git a/sysdeps/i386/i586/multiarch/rtld-memcpy.S b/sysdeps/i386/i586/multiarch/rtld-memcpy.S new file mode 100644 index 0000000000..2e53b40139 --- /dev/null +++ b/sysdeps/i386/i586/multiarch/rtld-memcpy.S @@ -0,0 +1,19 @@ +/* memcpy for ld.so + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include diff --git a/sysdeps/i386/i586/multiarch/static-memcpy.S b/sysdeps/i386/i586/multiarch/static-memcpy.S new file mode 100644 index 0000000000..37b0917636 --- /dev/null +++ b/sysdeps/i386/i586/multiarch/static-memcpy.S @@ -0,0 +1,21 @@ +/* memcpy for libc.a + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined SHARED && IS_IN (libc) +# include +#endif diff --git a/sysdeps/i386/i586/multiarch/static-mempcpy.S b/sysdeps/i386/i586/multiarch/static-mempcpy.S new file mode 100644 index 0000000000..418420fd50 --- /dev/null +++ b/sysdeps/i386/i586/multiarch/static-mempcpy.S @@ -0,0 +1,21 @@ +/* mempcpy for libc.a + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if !defined SHARED && IS_IN (libc) +# include +#endif diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index 59072b94da..3b1190eb12 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -1,8 +1,5 @@ ifeq ($(subdir),string) -sysdep_routines += memcpy-ssse3 mempcpy-ssse3 \ - memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \ - memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \ - strcmp-ssse3 \ +sysdep_routines += strcmp-ssse3 \ strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \ memcmp-ssse3 memcmp-sse4 varshift \ strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \ @@ -16,9 +13,7 @@ sysdep_routines += memcpy-ssse3 mempcpy-ssse3 \ strnlen-sse2 strnlen-c \ strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \ strncase_l-c strncase-c strncase_l-ssse3 \ - strcasecmp_l-sse4 strncase_l-sse4 \ - bcopy-sse2-unaligned memcpy-sse2-unaligned \ - mempcpy-sse2-unaligned memmove-sse2-unaligned + strcasecmp_l-sse4 strncase_l-sse4 ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/i386/i686/multiarch/bcopy-i386.S b/sysdeps/i386/i686/multiarch/bcopy-i386.S new file mode 100644 index 0000000000..9d841c9fd1 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/bcopy-i386.S @@ -0,0 +1 @@ +/* Dummy file. 
*/ diff --git a/sysdeps/i386/i686/multiarch/bcopy-i686.S b/sysdeps/i386/i686/multiarch/bcopy-i686.S new file mode 100644 index 0000000000..bb13d280d6 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/bcopy-i686.S @@ -0,0 +1,7 @@ +#include + +#ifdef SHARED + .globl __GI_bcopy + .hidden __GI_bcopy + __GI_bcopy = __bcopy_i686 +#endif diff --git a/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S b/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S deleted file mode 100644 index efef2a10dd..0000000000 --- a/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define USE_AS_BCOPY -#define MEMCPY __bcopy_sse2_unaligned -#include "memcpy-sse2-unaligned.S" diff --git a/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S deleted file mode 100644 index cbc8b420e8..0000000000 --- a/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define USE_AS_BCOPY -#define MEMCPY __bcopy_ssse3_rep -#include "memcpy-ssse3-rep.S" diff --git a/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/sysdeps/i386/i686/multiarch/bcopy-ssse3.S deleted file mode 100644 index 36aac44b9c..0000000000 --- a/sysdeps/i386/i686/multiarch/bcopy-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define USE_AS_BCOPY -#define MEMCPY __bcopy_ssse3 -#include "memcpy-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/bcopy.S b/sysdeps/i386/i686/multiarch/bcopy.S deleted file mode 100644 index 3fc95dcba9..0000000000 --- a/sysdeps/i386/i686/multiarch/bcopy.S +++ /dev/null @@ -1,59 +0,0 @@ -/* Multiple versions of bcopy - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include - -/* Define multiple versions only for the definition in lib. */ -#if IS_IN (libc) - .text -ENTRY(bcopy) - .type bcopy, @gnu_indirect_function - LOAD_GOT_AND_RTLD_GLOBAL_RO - LOAD_FUNC_GOT_EAX (__bcopy_ia32) - HAS_CPU_FEATURE (SSE2) - jz 2f - LOAD_FUNC_GOT_EAX (__bcopy_sse2_unaligned) - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - jnz 2f - HAS_CPU_FEATURE (SSSE3) - jz 2f - LOAD_FUNC_GOT_EAX (__bcopy_ssse3) - HAS_CPU_FEATURE (Fast_Rep_String) - jz 2f - LOAD_FUNC_GOT_EAX (__bcopy_ssse3_rep) -2: ret -END(bcopy) - -# undef ENTRY -# define ENTRY(name) \ - .type __bcopy_ia32, @function; \ - .p2align 4; \ - .globl __bcopy_ia32; \ - .hidden __bcopy_ia32; \ - __bcopy_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32 - -#endif - -#include "../bcopy.S" diff --git a/sysdeps/i386/i686/multiarch/bcopy.c b/sysdeps/i386/i686/multiarch/bcopy.c new file mode 100644 index 0000000000..6fb21e864c --- /dev/null +++ b/sysdeps/i386/i686/multiarch/bcopy.c @@ -0,0 +1 @@ +#include diff --git a/sysdeps/i386/i686/multiarch/memcpy-i386.S b/sysdeps/i386/i686/multiarch/memcpy-i386.S new file mode 100644 index 0000000000..9d841c9fd1 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memcpy-i386.S @@ -0,0 +1 @@ +/* Dummy file. 
*/ diff --git a/sysdeps/i386/i686/multiarch/memcpy-i586.S b/sysdeps/i386/i686/multiarch/memcpy-i586.S new file mode 100644 index 0000000000..9d841c9fd1 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memcpy-i586.S @@ -0,0 +1 @@ +/* Dummy file. */ diff --git a/sysdeps/i386/i686/multiarch/memcpy-i686.S b/sysdeps/i386/i686/multiarch/memcpy-i686.S new file mode 100644 index 0000000000..e7f84df9fe --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memcpy-i686.S @@ -0,0 +1,7 @@ +#include + +#ifdef SHARED + .globl __GI_memcpy + .hidden __GI_memcpy + __GI_memcpy = __memcpy_i686 +#endif diff --git a/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S deleted file mode 100644 index 8215c70b15..0000000000 --- a/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S +++ /dev/null @@ -1,681 +0,0 @@ -/* memcpy optimized with SSE2 unaligned memory access instructions. - Copyright (C) 2014-2015 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . 
*/ - -#if IS_IN (libc) \ - && (defined SHARED \ - || defined USE_AS_MEMMOVE \ - || !defined USE_MULTIARCH) - -# include -# include "asm-syntax.h" - -# ifndef MEMCPY -# define MEMCPY __memcpy_sse2_unaligned -# define MEMCPY_CHK __memcpy_chk_sse2_unaligned -# endif - -# ifdef USE_AS_BCOPY -# define SRC PARMS -# define DEST SRC+4 -# define LEN DEST+4 -# else -# define DEST PARMS -# define SRC DEST+4 -# define LEN SRC+4 -# endif - -# define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -# define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) - -# define PARMS 8 /* Preserve EBX. */ -# define ENTRANCE PUSH (%ebx); -# define RETURN_END POP (%ebx); ret -# define RETURN RETURN_END; CFI_PUSH (%ebx) - - .section .text.sse2,"ax",@progbits -# if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -# endif - -ENTRY (MEMCPY) - ENTRANCE - movl LEN(%esp), %ecx - movl SRC(%esp), %eax - movl DEST(%esp), %edx - cmp %edx, %eax - -# ifdef USE_AS_MEMMOVE - jg L(check_forward) - -L(mm_len_0_or_more_backward): -/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128] - separately. */ - cmp $16, %ecx - jbe L(mm_len_0_16_bytes_backward) - - cmpl $32, %ecx - jg L(mm_len_32_or_more_backward) - -/* Copy [0..32] and return. */ - movdqu (%eax), %xmm0 - movdqu -16(%eax, %ecx), %xmm1 - movdqu %xmm0, (%edx) - movdqu %xmm1, -16(%edx, %ecx) - jmp L(return) - -L(mm_len_32_or_more_backward): - cmpl $64, %ecx - jg L(mm_len_64_or_more_backward) - -/* Copy [0..64] and return. 
*/ - movdqu (%eax), %xmm0 - movdqu 16(%eax), %xmm1 - movdqu -16(%eax, %ecx), %xmm2 - movdqu -32(%eax, %ecx), %xmm3 - movdqu %xmm0, (%edx) - movdqu %xmm1, 16(%edx) - movdqu %xmm2, -16(%edx, %ecx) - movdqu %xmm3, -32(%edx, %ecx) - jmp L(return) - -L(mm_len_64_or_more_backward): - cmpl $128, %ecx - jg L(mm_len_128_or_more_backward) - -/* Copy [0..128] and return. */ - movdqu (%eax), %xmm0 - movdqu 16(%eax), %xmm1 - movdqu 32(%eax), %xmm2 - movdqu 48(%eax), %xmm3 - movdqu -64(%eax, %ecx), %xmm4 - movdqu -48(%eax, %ecx), %xmm5 - movdqu -32(%eax, %ecx), %xmm6 - movdqu -16(%eax, %ecx), %xmm7 - movdqu %xmm0, (%edx) - movdqu %xmm1, 16(%edx) - movdqu %xmm2, 32(%edx) - movdqu %xmm3, 48(%edx) - movdqu %xmm4, -64(%edx, %ecx) - movdqu %xmm5, -48(%edx, %ecx) - movdqu %xmm6, -32(%edx, %ecx) - movdqu %xmm7, -16(%edx, %ecx) - jmp L(return) - -L(mm_len_128_or_more_backward): - add %ecx, %eax - cmp %edx, %eax - movl SRC(%esp), %eax - jle L(forward) - PUSH (%esi) - PUSH (%edi) - PUSH (%ebx) - -/* Aligning the address of destination. 
*/ - movdqu (%eax), %xmm4 - movdqu 16(%eax), %xmm5 - movdqu 32(%eax), %xmm6 - movdqu 48(%eax), %xmm7 - leal (%edx, %ecx), %esi - movdqu -16(%eax, %ecx), %xmm0 - subl $16, %esp - movdqu %xmm0, (%esp) - mov %ecx, %edi - movl %esi, %ecx - andl $-16, %ecx - leal (%ecx), %ebx - subl %edx, %ebx - leal (%eax, %ebx), %eax - shrl $6, %ebx - -# ifdef SHARED_CACHE_SIZE_HALF - cmp $SHARED_CACHE_SIZE_HALF, %edi -# else -# ifdef SHARED - PUSH (%ebx) - SETUP_PIC_REG (bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi - POP (%ebx) -# else - cmp __x86_shared_cache_size_half, %edi -# endif -# endif - jae L(mm_large_page_loop_backward) - - .p2align 4 -L(mm_main_loop_backward): - - prefetcht0 -128(%eax) - - movdqu -64(%eax), %xmm0 - movdqu -48(%eax), %xmm1 - movdqu -32(%eax), %xmm2 - movdqu -16(%eax), %xmm3 - movaps %xmm0, -64(%ecx) - subl $64, %eax - movaps %xmm1, -48(%ecx) - movaps %xmm2, -32(%ecx) - movaps %xmm3, -16(%ecx) - subl $64, %ecx - sub $1, %ebx - jnz L(mm_main_loop_backward) - movdqu (%esp), %xmm0 - addl $16, %esp - movdqu %xmm0, -16(%esi) - movdqu %xmm4, (%edx) - movdqu %xmm5, 16(%edx) - movdqu %xmm6, 32(%edx) - movdqu %xmm7, 48(%edx) - POP (%ebx) - jmp L(mm_return_pop_all) - -/* Copy [0..16] and return. 
*/ -L(mm_len_0_16_bytes_backward): - testb $24, %cl - jnz L(mm_len_9_16_bytes_backward) - testb $4, %cl - .p2align 4,,5 - jnz L(mm_len_5_8_bytes_backward) - testl %ecx, %ecx - .p2align 4,,2 - je L(return) - testb $2, %cl - .p2align 4,,1 - jne L(mm_len_3_4_bytes_backward) - movzbl -1(%eax,%ecx), %ebx - movzbl (%eax), %eax - movb %bl, -1(%edx,%ecx) - movb %al, (%edx) - jmp L(return) - -L(mm_len_3_4_bytes_backward): - movzwl -2(%eax,%ecx), %ebx - movzwl (%eax), %eax - movw %bx, -2(%edx,%ecx) - movw %ax, (%edx) - jmp L(return) - -L(mm_len_9_16_bytes_backward): - PUSH (%esi) - movl -4(%eax,%ecx), %ebx - movl -8(%eax,%ecx), %esi - movl %ebx, -4(%edx,%ecx) - movl %esi, -8(%edx,%ecx) - subl $8, %ecx - POP (%esi) - jmp L(mm_len_0_16_bytes_backward) - -L(mm_len_5_8_bytes_backward): - movl (%eax), %ebx - movl -4(%eax,%ecx), %eax - movl %ebx, (%edx) - movl %eax, -4(%edx,%ecx) - jmp L(return) - -/* Big length copy backward part. */ - .p2align 4 -L(mm_large_page_loop_backward): - movdqu -64(%eax), %xmm0 - movdqu -48(%eax), %xmm1 - movdqu -32(%eax), %xmm2 - movdqu -16(%eax), %xmm3 - movntdq %xmm0, -64(%ecx) - subl $64, %eax - movntdq %xmm1, -48(%ecx) - movntdq %xmm2, -32(%ecx) - movntdq %xmm3, -16(%ecx) - subl $64, %ecx - sub $1, %ebx - jnz L(mm_large_page_loop_backward) - sfence - movdqu (%esp), %xmm0 - addl $16, %esp - movdqu %xmm0, -16(%esi) - movdqu %xmm4, (%edx) - movdqu %xmm5, 16(%edx) - movdqu %xmm6, 32(%edx) - movdqu %xmm7, 48(%edx) - POP (%ebx) - jmp L(mm_return_pop_all) - -L(check_forward): - add %edx, %ecx - cmp %eax, %ecx - movl LEN(%esp), %ecx - jle L(forward) - -/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128] - separately. */ - cmp $16, %ecx - jbe L(mm_len_0_16_bytes_forward) - - cmpl $32, %ecx - ja L(mm_len_32_or_more_forward) - -/* Copy [0..32] and return. 
*/ - movdqu (%eax), %xmm0 - movdqu -16(%eax, %ecx), %xmm1 - movdqu %xmm0, (%edx) - movdqu %xmm1, -16(%edx, %ecx) - jmp L(return) - -L(mm_len_32_or_more_forward): - cmpl $64, %ecx - ja L(mm_len_64_or_more_forward) - -/* Copy [0..64] and return. */ - movdqu (%eax), %xmm0 - movdqu 16(%eax), %xmm1 - movdqu -16(%eax, %ecx), %xmm2 - movdqu -32(%eax, %ecx), %xmm3 - movdqu %xmm0, (%edx) - movdqu %xmm1, 16(%edx) - movdqu %xmm2, -16(%edx, %ecx) - movdqu %xmm3, -32(%edx, %ecx) - jmp L(return) - -L(mm_len_64_or_more_forward): - cmpl $128, %ecx - ja L(mm_len_128_or_more_forward) - -/* Copy [0..128] and return. */ - movdqu (%eax), %xmm0 - movdqu 16(%eax), %xmm1 - movdqu 32(%eax), %xmm2 - movdqu 48(%eax), %xmm3 - movdqu -64(%eax, %ecx), %xmm4 - movdqu -48(%eax, %ecx), %xmm5 - movdqu -32(%eax, %ecx), %xmm6 - movdqu -16(%eax, %ecx), %xmm7 - movdqu %xmm0, (%edx) - movdqu %xmm1, 16(%edx) - movdqu %xmm2, 32(%edx) - movdqu %xmm3, 48(%edx) - movdqu %xmm4, -64(%edx, %ecx) - movdqu %xmm5, -48(%edx, %ecx) - movdqu %xmm6, -32(%edx, %ecx) - movdqu %xmm7, -16(%edx, %ecx) - jmp L(return) - -L(mm_len_128_or_more_forward): - PUSH (%esi) - PUSH (%edi) - PUSH (%ebx) - -/* Aligning the address of destination. 
*/ - movdqu -16(%eax, %ecx), %xmm4 - movdqu -32(%eax, %ecx), %xmm5 - movdqu -48(%eax, %ecx), %xmm6 - movdqu -64(%eax, %ecx), %xmm7 - leal (%edx, %ecx), %esi - movdqu (%eax), %xmm0 - subl $16, %esp - movdqu %xmm0, (%esp) - mov %ecx, %edi - leal 16(%edx), %ecx - andl $-16, %ecx - movl %ecx, %ebx - subl %edx, %ebx - addl %ebx, %eax - movl %esi, %ebx - subl %ecx, %ebx - shrl $6, %ebx - -# ifdef SHARED_CACHE_SIZE_HALF - cmp $SHARED_CACHE_SIZE_HALF, %edi -# else -# ifdef SHARED - PUSH (%ebx) - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi - POP (%ebx) -# else - cmp __x86_shared_cache_size_half, %edi -# endif -# endif - jae L(mm_large_page_loop_forward) - - .p2align 4 -L(mm_main_loop_forward): - - prefetcht0 128(%eax) - - movdqu (%eax), %xmm0 - movdqu 16(%eax), %xmm1 - movdqu 32(%eax), %xmm2 - movdqu 48(%eax), %xmm3 - movdqa %xmm0, (%ecx) - addl $64, %eax - movaps %xmm1, 16(%ecx) - movaps %xmm2, 32(%ecx) - movaps %xmm3, 48(%ecx) - addl $64, %ecx - sub $1, %ebx - jnz L(mm_main_loop_forward) - movdqu (%esp), %xmm0 - addl $16, %esp - movdqu %xmm0, (%edx) - movdqu %xmm4, -16(%esi) - movdqu %xmm5, -32(%esi) - movdqu %xmm6, -48(%esi) - movdqu %xmm7, -64(%esi) - POP (%ebx) - jmp L(mm_return_pop_all) - -L(mm_len_0_16_bytes_forward): - testb $24, %cl - jne L(mm_len_9_16_bytes_forward) - testb $4, %cl - .p2align 4,,5 - jne L(mm_len_5_8_bytes_forward) - testl %ecx, %ecx - .p2align 4,,2 - je L(return) - testb $2, %cl - .p2align 4,,1 - jne L(mm_len_2_4_bytes_forward) - movzbl -1(%eax,%ecx), %ebx - movzbl (%eax), %eax - movb %bl, -1(%edx,%ecx) - movb %al, (%edx) - jmp L(return) - -L(mm_len_2_4_bytes_forward): - movzwl -2(%eax,%ecx), %ebx - movzwl (%eax), %eax - movw %bx, -2(%edx,%ecx) - movw %ax, (%edx) - jmp L(return) - -L(mm_len_5_8_bytes_forward): - movl (%eax), %ebx - movl -4(%eax,%ecx), %eax - movl %ebx, (%edx) - movl %eax, -4(%edx,%ecx) - jmp L(return) - -L(mm_len_9_16_bytes_forward): - movq (%eax), %xmm0 - movq 
-8(%eax, %ecx), %xmm1 - movq %xmm0, (%edx) - movq %xmm1, -8(%edx, %ecx) - jmp L(return) - -L(mm_return_pop_all): - movl %edx, %eax - POP (%edi) - POP (%esi) - RETURN - -/* Big length copy forward part. */ - .p2align 4 -L(mm_large_page_loop_forward): - movdqu (%eax), %xmm0 - movdqu 16(%eax), %xmm1 - movdqu 32(%eax), %xmm2 - movdqu 48(%eax), %xmm3 - movntdq %xmm0, (%ecx) - addl $64, %eax - movntdq %xmm1, 16(%ecx) - movntdq %xmm2, 32(%ecx) - movntdq %xmm3, 48(%ecx) - addl $64, %ecx - sub $1, %ebx - jnz L(mm_large_page_loop_forward) - sfence - movdqu (%esp), %xmm0 - addl $16, %esp - movdqu %xmm0, (%edx) - movdqu %xmm4, -16(%esi) - movdqu %xmm5, -32(%esi) - movdqu %xmm6, -48(%esi) - movdqu %xmm7, -64(%esi) - POP (%ebx) - jmp L(mm_return_pop_all) -# endif - -L(forward): - cmp $16, %ecx - jbe L(len_0_16_bytes) - -# ifdef SHARED_CACHE_SIZE_HALF - cmp $SHARED_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_shared_cache_size_half, %ecx -# endif -# endif - jae L(large_page) - - movdqu (%eax), %xmm0 - movdqu -16(%eax, %ecx), %xmm1 - cmpl $32, %ecx - movdqu %xmm0, (%edx) - movdqu %xmm1, -16(%edx, %ecx) - jbe L(return) - - movdqu 16(%eax), %xmm0 - movdqu -32(%eax, %ecx), %xmm1 - cmpl $64, %ecx - movdqu %xmm0, 16(%edx) - movdqu %xmm1, -32(%edx, %ecx) - jbe L(return) - - movdqu 32(%eax), %xmm0 - movdqu 48(%eax), %xmm1 - movdqu -48(%eax, %ecx), %xmm2 - movdqu -64(%eax, %ecx), %xmm3 - cmpl $128, %ecx - movdqu %xmm0, 32(%edx) - movdqu %xmm1, 48(%edx) - movdqu %xmm2, -48(%edx, %ecx) - movdqu %xmm3, -64(%edx, %ecx) - jbe L(return) - -/* Now the main loop: we align the address of the destination. */ - leal 64(%edx), %ebx - andl $-64, %ebx - - addl %edx, %ecx - andl $-64, %ecx - - subl %edx, %eax - -/* We should stop two iterations before the termination - (in order not to misprefetch). 
*/ - subl $64, %ecx - cmpl %ebx, %ecx - je L(main_loop_just_one_iteration) - - subl $64, %ecx - cmpl %ebx, %ecx - je L(main_loop_last_two_iterations) - - .p2align 4 -L(main_loop_cache): - - prefetcht0 128(%ebx, %eax) - - movdqu (%ebx, %eax), %xmm0 - movdqu 16(%ebx, %eax), %xmm1 - movdqu 32(%ebx, %eax), %xmm2 - movdqu 48(%ebx, %eax), %xmm3 - movdqa %xmm0, (%ebx) - movaps %xmm1, 16(%ebx) - movaps %xmm2, 32(%ebx) - movaps %xmm3, 48(%ebx) - lea 64(%ebx), %ebx - cmpl %ebx, %ecx - jne L(main_loop_cache) - -L(main_loop_last_two_iterations): - movdqu (%ebx, %eax), %xmm0 - movdqu 16(%ebx, %eax), %xmm1 - movdqu 32(%ebx, %eax), %xmm2 - movdqu 48(%ebx, %eax), %xmm3 - movdqu 64(%ebx, %eax), %xmm4 - movdqu 80(%ebx, %eax), %xmm5 - movdqu 96(%ebx, %eax), %xmm6 - movdqu 112(%ebx, %eax), %xmm7 - movdqa %xmm0, (%ebx) - movaps %xmm1, 16(%ebx) - movaps %xmm2, 32(%ebx) - movaps %xmm3, 48(%ebx) - movaps %xmm4, 64(%ebx) - movaps %xmm5, 80(%ebx) - movaps %xmm6, 96(%ebx) - movaps %xmm7, 112(%ebx) - jmp L(return) - -L(main_loop_just_one_iteration): - movdqu (%ebx, %eax), %xmm0 - movdqu 16(%ebx, %eax), %xmm1 - movdqu 32(%ebx, %eax), %xmm2 - movdqu 48(%ebx, %eax), %xmm3 - movdqa %xmm0, (%ebx) - movaps %xmm1, 16(%ebx) - movaps %xmm2, 32(%ebx) - movaps %xmm3, 48(%ebx) - jmp L(return) - -L(large_page): - movdqu (%eax), %xmm0 - movdqu 16(%eax), %xmm1 - movdqu 32(%eax), %xmm2 - movdqu 48(%eax), %xmm3 - movdqu -64(%eax, %ecx), %xmm4 - movdqu -48(%eax, %ecx), %xmm5 - movdqu -32(%eax, %ecx), %xmm6 - movdqu -16(%eax, %ecx), %xmm7 - movdqu %xmm0, (%edx) - movdqu %xmm1, 16(%edx) - movdqu %xmm2, 32(%edx) - movdqu %xmm3, 48(%edx) - movdqu %xmm4, -64(%edx, %ecx) - movdqu %xmm5, -48(%edx, %ecx) - movdqu %xmm6, -32(%edx, %ecx) - movdqu %xmm7, -16(%edx, %ecx) - - movdqu 64(%eax), %xmm0 - movdqu 80(%eax), %xmm1 - movdqu 96(%eax), %xmm2 - movdqu 112(%eax), %xmm3 - movdqu -128(%eax, %ecx), %xmm4 - movdqu -112(%eax, %ecx), %xmm5 - movdqu -96(%eax, %ecx), %xmm6 - movdqu -80(%eax, %ecx), %xmm7 - movdqu %xmm0, 
64(%edx) - movdqu %xmm1, 80(%edx) - movdqu %xmm2, 96(%edx) - movdqu %xmm3, 112(%edx) - movdqu %xmm4, -128(%edx, %ecx) - movdqu %xmm5, -112(%edx, %ecx) - movdqu %xmm6, -96(%edx, %ecx) - movdqu %xmm7, -80(%edx, %ecx) - -/* Now the main loop with non temporal stores. We align - the address of the destination. */ - leal 128(%edx), %ebx - andl $-128, %ebx - - addl %edx, %ecx - andl $-128, %ecx - - subl %edx, %eax - - .p2align 4 -L(main_loop_large_page): - movdqu (%ebx, %eax), %xmm0 - movdqu 16(%ebx, %eax), %xmm1 - movdqu 32(%ebx, %eax), %xmm2 - movdqu 48(%ebx, %eax), %xmm3 - movdqu 64(%ebx, %eax), %xmm4 - movdqu 80(%ebx, %eax), %xmm5 - movdqu 96(%ebx, %eax), %xmm6 - movdqu 112(%ebx, %eax), %xmm7 - movntdq %xmm0, (%ebx) - movntdq %xmm1, 16(%ebx) - movntdq %xmm2, 32(%ebx) - movntdq %xmm3, 48(%ebx) - movntdq %xmm4, 64(%ebx) - movntdq %xmm5, 80(%ebx) - movntdq %xmm6, 96(%ebx) - movntdq %xmm7, 112(%ebx) - lea 128(%ebx), %ebx - cmpl %ebx, %ecx - jne L(main_loop_large_page) - sfence - jmp L(return) - -L(len_0_16_bytes): - testb $24, %cl - jne L(len_9_16_bytes) - testb $4, %cl - .p2align 4,,5 - jne L(len_5_8_bytes) - testl %ecx, %ecx - .p2align 4,,2 - je L(return) - movzbl (%eax), %ebx - testb $2, %cl - movb %bl, (%edx) - je L(return) - movzwl -2(%eax,%ecx), %ebx - movw %bx, -2(%edx,%ecx) - jmp L(return) - -L(len_9_16_bytes): - movq (%eax), %xmm0 - movq -8(%eax, %ecx), %xmm1 - movq %xmm0, (%edx) - movq %xmm1, -8(%edx, %ecx) - jmp L(return) - -L(len_5_8_bytes): - movl (%eax), %ebx - movl %ebx, (%edx) - movl -4(%eax,%ecx), %ebx - movl %ebx, -4(%edx,%ecx) - -L(return): - movl %edx, %eax -# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif - RETURN - -END (MEMCPY) -#endif diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S deleted file mode 100644 index 08d877c03a..0000000000 --- a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S +++ /dev/null @@ -1,1809 +0,0 @@ -/* memcpy with 
SSSE3 and REP string. - Copyright (C) 2010-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include - -#if IS_IN (libc) \ - && (defined SHARED \ - || defined USE_AS_MEMMOVE \ - || !defined USE_MULTIARCH) - -#include "asm-syntax.h" - -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3_rep -# define MEMCPY_CHK __memcpy_chk_ssse3_rep -#endif - -#ifdef USE_AS_BCOPY -# define SRC PARMS -# define DEST SRC+4 -# define LEN DEST+4 -#else -# define DEST PARMS -# define SRC DEST+4 -# define LEN SRC+4 -#endif - -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) - -#ifdef SHARED -# define PARMS 8 /* Preserve EBX. */ -# define ENTRANCE PUSH (%ebx); -# define RETURN_END POP (%ebx); ret -# define RETURN RETURN_END; CFI_PUSH (%ebx) -# define JMPTBL(I, B) I - B - -/* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - /* We first load PC into EBX. 
*/ \ - SETUP_PIC_REG(bx); \ - /* Get the address of the jump table. */ \ - addl $(TABLE - .), %ebx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table. Go. */ \ - jmp *%ebx - -# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \ - addl $(TABLE - .), %ebx - -# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table. Go. */ \ - jmp *%ebx -#else -# define PARMS 4 -# define ENTRANCE -# define RETURN_END ret -# define RETURN RETURN_END -# define JMPTBL(I, B) I - -/* Branch to an entry in a jump table. TABLE is a jump table with - absolute offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) - -# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) - -# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) -#endif - - .section .text.ssse3,"ax",@progbits -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif -ENTRY (MEMCPY) - ENTRANCE - movl LEN(%esp), %ecx - movl SRC(%esp), %eax - movl DEST(%esp), %edx - -#ifdef USE_AS_MEMMOVE - cmp %eax, %edx - jb L(copy_forward) - je L(fwd_write_0bytes) - cmp $48, %ecx - jb L(bk_write_less48bytes) - add %ecx, %eax - cmp %eax, %edx - movl SRC(%esp), %eax - jb L(copy_backward) - -L(copy_forward): -#endif - cmp $48, %ecx - jae L(48bytesormore) - -L(fwd_write_less32bytes): -#ifndef USE_AS_MEMMOVE - cmp %dl, %al - jb L(bk_write) -#endif - add %ecx, %edx - add %ecx, %eax - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) -#ifndef USE_AS_MEMMOVE -L(bk_write): - BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) -#endif - - ALIGN (4) -/* ECX > 32 and EDX is 4 byte aligned. 
*/ -L(48bytesormore): - movdqu (%eax), %xmm0 - PUSH (%edi) - movl %edx, %edi - and $-16, %edx - PUSH (%esi) - cfi_remember_state - add $16, %edx - movl %edi, %esi - sub %edx, %edi - add %edi, %ecx - sub %edi, %eax - -#ifdef SHARED_CACHE_SIZE_HALF - cmp $SHARED_CACHE_SIZE_HALF, %ecx -#else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_shared_cache_size_half, %ecx -# endif -#endif - - mov %eax, %edi - jae L(large_page) - and $0xf, %edi - jz L(shl_0) - - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) - - ALIGN (4) -L(shl_0): - movdqu %xmm0, (%esi) - xor %edi, %edi - cmp $127, %ecx - ja L(shl_0_gobble) - lea -32(%ecx), %ecx -L(shl_0_loop): - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi - jb L(shl_0_end) - - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi - jb L(shl_0_end) - - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi - jb L(shl_0_end) - - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi -L(shl_0_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - add %edi, %eax - POP (%esi) - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state -L(shl_0_gobble): - -#ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -#else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi -# else - mov __x86_data_cache_size_half, %edi -# endif -#endif - mov %edi, %esi - shr $3, %esi - sub %esi, %edi - cmp %edi, %ecx - jae 
L(shl_0_gobble_mem_start) - sub $128, %ecx - ALIGN (4) -L(shl_0_gobble_cache_loop): - movdqa (%eax), %xmm0 - movaps 0x10(%eax), %xmm1 - movaps 0x20(%eax), %xmm2 - movaps 0x30(%eax), %xmm3 - movaps 0x40(%eax), %xmm4 - movaps 0x50(%eax), %xmm5 - movaps 0x60(%eax), %xmm6 - movaps 0x70(%eax), %xmm7 - lea 0x80(%eax), %eax - sub $128, %ecx - movdqa %xmm0, (%edx) - movaps %xmm1, 0x10(%edx) - movaps %xmm2, 0x20(%edx) - movaps %xmm3, 0x30(%edx) - movaps %xmm4, 0x40(%edx) - movaps %xmm5, 0x50(%edx) - movaps %xmm6, 0x60(%edx) - movaps %xmm7, 0x70(%edx) - lea 0x80(%edx), %edx - - jae L(shl_0_gobble_cache_loop) - add $0x80, %ecx - cmp $0x40, %ecx - jb L(shl_0_cache_less_64bytes) - - movdqa (%eax), %xmm0 - sub $0x40, %ecx - movdqa 0x10(%eax), %xmm1 - - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - - movdqa 0x20(%eax), %xmm0 - movdqa 0x30(%eax), %xmm1 - add $0x40, %eax - - movdqa %xmm0, 0x20(%edx) - movdqa %xmm1, 0x30(%edx) - add $0x40, %edx -L(shl_0_cache_less_64bytes): - cmp $0x20, %ecx - jb L(shl_0_cache_less_32bytes) - movdqa (%eax), %xmm0 - sub $0x20, %ecx - movdqa 0x10(%eax), %xmm1 - add $0x20, %eax - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - add $0x20, %edx -L(shl_0_cache_less_32bytes): - cmp $0x10, %ecx - jb L(shl_0_cache_less_16bytes) - sub $0x10, %ecx - movdqa (%eax), %xmm0 - add $0x10, %eax - movdqa %xmm0, (%edx) - add $0x10, %edx -L(shl_0_cache_less_16bytes): - add %ecx, %edx - add %ecx, %eax - POP (%esi) - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_0_gobble_mem_start): - cmp %al, %dl - je L(copy_page_by_rep) - sub $128, %ecx -L(shl_0_gobble_mem_loop): - prefetchnta 0x1c0(%eax) - prefetchnta 0x280(%eax) - prefetchnta 0x1c0(%edx) - prefetchnta 0x280(%edx) - - movdqa (%eax), %xmm0 - movaps 0x10(%eax), %xmm1 - movaps 0x20(%eax), %xmm2 - movaps 0x30(%eax), %xmm3 - movaps 0x40(%eax), %xmm4 - movaps 0x50(%eax), %xmm5 - movaps 0x60(%eax), %xmm6 - movaps 0x70(%eax), %xmm7 - lea 
0x80(%eax), %eax - sub $0x80, %ecx - movdqa %xmm0, (%edx) - movaps %xmm1, 0x10(%edx) - movaps %xmm2, 0x20(%edx) - movaps %xmm3, 0x30(%edx) - movaps %xmm4, 0x40(%edx) - movaps %xmm5, 0x50(%edx) - movaps %xmm6, 0x60(%edx) - movaps %xmm7, 0x70(%edx) - lea 0x80(%edx), %edx - - jae L(shl_0_gobble_mem_loop) - add $0x80, %ecx - cmp $0x40, %ecx - jb L(shl_0_mem_less_64bytes) - - movdqa (%eax), %xmm0 - sub $0x40, %ecx - movdqa 0x10(%eax), %xmm1 - - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - - movdqa 0x20(%eax), %xmm0 - movdqa 0x30(%eax), %xmm1 - add $0x40, %eax - - movdqa %xmm0, 0x20(%edx) - movdqa %xmm1, 0x30(%edx) - add $0x40, %edx -L(shl_0_mem_less_64bytes): - cmp $0x20, %ecx - jb L(shl_0_mem_less_32bytes) - movdqa (%eax), %xmm0 - sub $0x20, %ecx - movdqa 0x10(%eax), %xmm1 - add $0x20, %eax - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - add $0x20, %edx -L(shl_0_mem_less_32bytes): - cmp $0x10, %ecx - jb L(shl_0_mem_less_16bytes) - sub $0x10, %ecx - movdqa (%eax), %xmm0 - add $0x10, %eax - movdqa %xmm0, (%edx) - add $0x10, %edx -L(shl_0_mem_less_16bytes): - add %ecx, %edx - add %ecx, %eax - POP (%esi) - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_1): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $1, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_1_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $1, %xmm2, %xmm3 - palignr $1, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_1_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $1, %xmm2, %xmm3 - palignr $1, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_1_loop) - -L(shl_1_end): - add $32, %ecx - add 
%ecx, %edi - add %edi, %edx - lea 1(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_2): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $2, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_2_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $2, %xmm2, %xmm3 - palignr $2, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_2_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $2, %xmm2, %xmm3 - palignr $2, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_2_loop) - -L(shl_2_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 2(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_3): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $3, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_3_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $3, %xmm2, %xmm3 - palignr $3, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_3_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $3, %xmm2, %xmm3 - palignr $3, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_3_loop) - -L(shl_3_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 3(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - 
cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_4): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $4, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_4_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $4, %xmm2, %xmm3 - palignr $4, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_4_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $4, %xmm2, %xmm3 - palignr $4, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_4_loop) - -L(shl_4_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 4(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_5): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $5, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_5_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $5, %xmm2, %xmm3 - palignr $5, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_5_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $5, %xmm2, %xmm3 - palignr $5, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_5_loop) - -L(shl_5_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 5(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_6): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $6, %eax - 
movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_6_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $6, %xmm2, %xmm3 - palignr $6, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_6_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $6, %xmm2, %xmm3 - palignr $6, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_6_loop) - -L(shl_6_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 6(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_7): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $7, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_7_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $7, %xmm2, %xmm3 - palignr $7, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_7_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $7, %xmm2, %xmm3 - palignr $7, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_7_loop) - -L(shl_7_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 7(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_8): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $8, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_8_loop): - - movdqa 16(%eax, %edi), 
%xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $8, %xmm2, %xmm3 - palignr $8, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_8_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $8, %xmm2, %xmm3 - palignr $8, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_8_loop) - -L(shl_8_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 8(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_9): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $9, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_9_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $9, %xmm2, %xmm3 - palignr $9, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_9_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $9, %xmm2, %xmm3 - palignr $9, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_9_loop) - -L(shl_9_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 9(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_10): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $10, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_10_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $10, %xmm2, %xmm3 - palignr $10, %xmm1, %xmm2 
- lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_10_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $10, %xmm2, %xmm3 - palignr $10, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_10_loop) - -L(shl_10_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 10(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_11): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $11, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_11_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $11, %xmm2, %xmm3 - palignr $11, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_11_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $11, %xmm2, %xmm3 - palignr $11, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_11_loop) - -L(shl_11_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 11(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_12): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $12, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_12_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $12, %xmm2, %xmm3 - palignr $12, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_12_end) - - 
movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $12, %xmm2, %xmm3 - palignr $12, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_12_loop) - -L(shl_12_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 12(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_13): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $13, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_13_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $13, %xmm2, %xmm3 - palignr $13, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_13_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $13, %xmm2, %xmm3 - palignr $13, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_13_loop) - -L(shl_13_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 13(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_14): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $14, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_14_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $14, %xmm2, %xmm3 - palignr $14, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_14_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr 
$14, %xmm2, %xmm3 - palignr $14, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_14_loop) - -L(shl_14_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 14(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(shl_15): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - sub $15, %eax - movaps (%eax), %xmm1 - xor %edi, %edi - sub $32, %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_15_loop): - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $15, %xmm2, %xmm3 - palignr $15, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_15_end) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $15, %xmm2, %xmm3 - palignr $15, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(shl_15_loop) - -L(shl_15_end): - add $32, %ecx - add %ecx, %edi - add %edi, %edx - lea 15(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - - - ALIGN (4) -L(fwd_write_44bytes): - movl -44(%eax), %ecx - movl %ecx, -44(%edx) -L(fwd_write_40bytes): - movl -40(%eax), %ecx - movl %ecx, -40(%edx) -L(fwd_write_36bytes): - movl -36(%eax), %ecx - movl %ecx, -36(%edx) -L(fwd_write_32bytes): - movl -32(%eax), %ecx - movl %ecx, -32(%edx) -L(fwd_write_28bytes): - movl -28(%eax), %ecx - movl %ecx, -28(%edx) -L(fwd_write_24bytes): - movl -24(%eax), %ecx - movl %ecx, -24(%edx) -L(fwd_write_20bytes): - movl -20(%eax), %ecx - movl %ecx, -20(%edx) -L(fwd_write_16bytes): - movl -16(%eax), %ecx - movl %ecx, -16(%edx) -L(fwd_write_12bytes): - movl -12(%eax), %ecx - movl %ecx, -12(%edx) -L(fwd_write_8bytes): - movl -8(%eax), %ecx - movl %ecx, -8(%edx) -L(fwd_write_4bytes): - movl 
-4(%eax), %ecx - movl %ecx, -4(%edx) -L(fwd_write_0bytes): -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(fwd_write_5bytes): - movl -5(%eax), %ecx - movl -4(%eax), %eax - movl %ecx, -5(%edx) - movl %eax, -4(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(fwd_write_45bytes): - movl -45(%eax), %ecx - movl %ecx, -45(%edx) -L(fwd_write_41bytes): - movl -41(%eax), %ecx - movl %ecx, -41(%edx) -L(fwd_write_37bytes): - movl -37(%eax), %ecx - movl %ecx, -37(%edx) -L(fwd_write_33bytes): - movl -33(%eax), %ecx - movl %ecx, -33(%edx) -L(fwd_write_29bytes): - movl -29(%eax), %ecx - movl %ecx, -29(%edx) -L(fwd_write_25bytes): - movl -25(%eax), %ecx - movl %ecx, -25(%edx) -L(fwd_write_21bytes): - movl -21(%eax), %ecx - movl %ecx, -21(%edx) -L(fwd_write_17bytes): - movl -17(%eax), %ecx - movl %ecx, -17(%edx) -L(fwd_write_13bytes): - movl -13(%eax), %ecx - movl %ecx, -13(%edx) -L(fwd_write_9bytes): - movl -9(%eax), %ecx - movl %ecx, -9(%edx) - movl -5(%eax), %ecx - movl %ecx, -5(%edx) -L(fwd_write_1bytes): - movzbl -1(%eax), %ecx - movb %cl, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(fwd_write_46bytes): - movl -46(%eax), %ecx - movl %ecx, -46(%edx) -L(fwd_write_42bytes): - movl -42(%eax), %ecx - movl %ecx, -42(%edx) -L(fwd_write_38bytes): - movl -38(%eax), %ecx - movl %ecx, -38(%edx) -L(fwd_write_34bytes): - movl -34(%eax), %ecx - movl %ecx, -34(%edx) -L(fwd_write_30bytes): - movl -30(%eax), %ecx - movl %ecx, -30(%edx) -L(fwd_write_26bytes): - movl -26(%eax), %ecx - movl %ecx, -26(%edx) -L(fwd_write_22bytes): - movl -22(%eax), %ecx - movl %ecx, -22(%edx) -L(fwd_write_18bytes): - movl -18(%eax), %ecx - movl %ecx, -18(%edx) -L(fwd_write_14bytes): - movl -14(%eax), %ecx - movl %ecx, -14(%edx) 
-L(fwd_write_10bytes): - movl -10(%eax), %ecx - movl %ecx, -10(%edx) -L(fwd_write_6bytes): - movl -6(%eax), %ecx - movl %ecx, -6(%edx) -L(fwd_write_2bytes): - movzwl -2(%eax), %ecx - movw %cx, -2(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN - - ALIGN (4) -L(fwd_write_47bytes): - movl -47(%eax), %ecx - movl %ecx, -47(%edx) -L(fwd_write_43bytes): - movl -43(%eax), %ecx - movl %ecx, -43(%edx) -L(fwd_write_39bytes): - movl -39(%eax), %ecx - movl %ecx, -39(%edx) -L(fwd_write_35bytes): - movl -35(%eax), %ecx - movl %ecx, -35(%edx) -L(fwd_write_31bytes): - movl -31(%eax), %ecx - movl %ecx, -31(%edx) -L(fwd_write_27bytes): - movl -27(%eax), %ecx - movl %ecx, -27(%edx) -L(fwd_write_23bytes): - movl -23(%eax), %ecx - movl %ecx, -23(%edx) -L(fwd_write_19bytes): - movl -19(%eax), %ecx - movl %ecx, -19(%edx) -L(fwd_write_15bytes): - movl -15(%eax), %ecx - movl %ecx, -15(%edx) -L(fwd_write_11bytes): - movl -11(%eax), %ecx - movl %ecx, -11(%edx) -L(fwd_write_7bytes): - movl -7(%eax), %ecx - movl %ecx, -7(%edx) -L(fwd_write_3bytes): - movzwl -3(%eax), %ecx - movzbl -1(%eax), %eax - movw %cx, -3(%edx) - movb %al, -1(%edx) -#ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -#endif - RETURN_END - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(large_page): - movdqu (%eax), %xmm1 - movdqu %xmm0, (%esi) - movntdq %xmm1, (%edx) - add $0x10, %eax - add $0x10, %edx - sub $0x10, %ecx - cmp %al, %dl - je L(copy_page_by_rep) -L(large_page_loop_init): - POP (%esi) - sub $0x80, %ecx - POP (%edi) -L(large_page_loop): - prefetchnta 0x1c0(%eax) - prefetchnta 0x280(%eax) - movdqu (%eax), %xmm0 - movdqu 0x10(%eax), %xmm1 - movdqu 0x20(%eax), %xmm2 - movdqu 0x30(%eax), %xmm3 - movdqu 0x40(%eax), %xmm4 - movdqu 0x50(%eax), %xmm5 - movdqu 0x60(%eax), %xmm6 - movdqu 0x70(%eax), %xmm7 - lea 0x80(%eax), %eax - lfence - sub $0x80, %ecx - movntdq %xmm0, (%edx) - 
movntdq %xmm1, 0x10(%edx) - movntdq %xmm2, 0x20(%edx) - movntdq %xmm3, 0x30(%edx) - movntdq %xmm4, 0x40(%edx) - movntdq %xmm5, 0x50(%edx) - movntdq %xmm6, 0x60(%edx) - movntdq %xmm7, 0x70(%edx) - lea 0x80(%edx), %edx - jae L(large_page_loop) - add $0x80, %ecx - cmp $0x40, %ecx - jb L(large_page_less_64bytes) - - movdqu (%eax), %xmm0 - movdqu 0x10(%eax), %xmm1 - movdqu 0x20(%eax), %xmm2 - movdqu 0x30(%eax), %xmm3 - lea 0x40(%eax), %eax - - movntdq %xmm0, (%edx) - movntdq %xmm1, 0x10(%edx) - movntdq %xmm2, 0x20(%edx) - movntdq %xmm3, 0x30(%edx) - lea 0x40(%edx), %edx - sub $0x40, %ecx -L(large_page_less_64bytes): - cmp $32, %ecx - jb L(large_page_less_32bytes) - movdqu (%eax), %xmm0 - movdqu 0x10(%eax), %xmm1 - lea 0x20(%eax), %eax - movntdq %xmm0, (%edx) - movntdq %xmm1, 0x10(%edx) - lea 0x20(%edx), %edx - sub $0x20, %ecx -L(large_page_less_32bytes): - add %ecx, %edx - add %ecx, %eax - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - cfi_restore_state - cfi_remember_state - ALIGN (4) -L(copy_page_by_rep): - mov %eax, %esi - mov %edx, %edi - mov %ecx, %edx - shr $2, %ecx - and $3, %edx - rep movsl - jz L(copy_page_by_rep_exit) - cmp $2, %edx - jb L(copy_page_by_rep_left_1) - movzwl (%esi), %eax - movw %ax, (%edi) - add $2, %esi - add $2, %edi - sub $2, %edx - jz L(copy_page_by_rep_exit) -L(copy_page_by_rep_left_1): - movzbl (%esi), %eax - movb %al, (%edi) -L(copy_page_by_rep_exit): - POP (%esi) - POP (%edi) -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -#endif - RETURN - - ALIGN (4) -L(bk_write_44bytes): - movl 40(%eax), %ecx - movl %ecx, 40(%edx) -L(bk_write_40bytes): - movl 36(%eax), %ecx - movl %ecx, 36(%edx) -L(bk_write_36bytes): - movl 32(%eax), %ecx - movl %ecx, 32(%edx) -L(bk_write_32bytes): - movl 28(%eax), %ecx - movl %ecx, 28(%edx) -L(bk_write_28bytes): - movl 24(%eax), %ecx - movl %ecx, 24(%edx) -L(bk_write_24bytes): - movl 20(%eax), %ecx - movl %ecx, 20(%edx) 
-L(bk_write_20bytes): - movl 16(%eax), %ecx - movl %ecx, 16(%edx) -L(bk_write_16bytes): - movl 12(%eax), %ecx - movl %ecx, 12(%edx) -L(bk_write_12bytes): - movl 8(%eax), %ecx - movl %ecx, 8(%edx) -L(bk_write_8bytes): - movl 4(%eax), %ecx - movl %ecx, 4(%edx) -L(bk_write_4bytes): - movl (%eax), %ecx - movl %ecx, (%edx) -L(bk_write_0bytes): -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -#endif - RETURN - - ALIGN (4) -L(bk_write_45bytes): - movl 41(%eax), %ecx - movl %ecx, 41(%edx) -L(bk_write_41bytes): - movl 37(%eax), %ecx - movl %ecx, 37(%edx) -L(bk_write_37bytes): - movl 33(%eax), %ecx - movl %ecx, 33(%edx) -L(bk_write_33bytes): - movl 29(%eax), %ecx - movl %ecx, 29(%edx) -L(bk_write_29bytes): - movl 25(%eax), %ecx - movl %ecx, 25(%edx) -L(bk_write_25bytes): - movl 21(%eax), %ecx - movl %ecx, 21(%edx) -L(bk_write_21bytes): - movl 17(%eax), %ecx - movl %ecx, 17(%edx) -L(bk_write_17bytes): - movl 13(%eax), %ecx - movl %ecx, 13(%edx) -L(bk_write_13bytes): - movl 9(%eax), %ecx - movl %ecx, 9(%edx) -L(bk_write_9bytes): - movl 5(%eax), %ecx - movl %ecx, 5(%edx) -L(bk_write_5bytes): - movl 1(%eax), %ecx - movl %ecx, 1(%edx) -L(bk_write_1bytes): - movzbl (%eax), %ecx - movb %cl, (%edx) -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -#endif - RETURN - - ALIGN (4) -L(bk_write_46bytes): - movl 42(%eax), %ecx - movl %ecx, 42(%edx) -L(bk_write_42bytes): - movl 38(%eax), %ecx - movl %ecx, 38(%edx) -L(bk_write_38bytes): - movl 34(%eax), %ecx - movl %ecx, 34(%edx) -L(bk_write_34bytes): - movl 30(%eax), %ecx - movl %ecx, 30(%edx) -L(bk_write_30bytes): - movl 26(%eax), %ecx - movl %ecx, 26(%edx) -L(bk_write_26bytes): - movl 22(%eax), %ecx - movl %ecx, 22(%edx) -L(bk_write_22bytes): - movl 18(%eax), %ecx - movl %ecx, 18(%edx) -L(bk_write_18bytes): - movl 14(%eax), %ecx - movl %ecx, 14(%edx) -L(bk_write_14bytes): - movl 10(%eax), %ecx - movl 
%ecx, 10(%edx) -L(bk_write_10bytes): - movl 6(%eax), %ecx - movl %ecx, 6(%edx) -L(bk_write_6bytes): - movl 2(%eax), %ecx - movl %ecx, 2(%edx) -L(bk_write_2bytes): - movzwl (%eax), %ecx - movw %cx, (%edx) -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -#endif - RETURN - - ALIGN (4) -L(bk_write_47bytes): - movl 43(%eax), %ecx - movl %ecx, 43(%edx) -L(bk_write_43bytes): - movl 39(%eax), %ecx - movl %ecx, 39(%edx) -L(bk_write_39bytes): - movl 35(%eax), %ecx - movl %ecx, 35(%edx) -L(bk_write_35bytes): - movl 31(%eax), %ecx - movl %ecx, 31(%edx) -L(bk_write_31bytes): - movl 27(%eax), %ecx - movl %ecx, 27(%edx) -L(bk_write_27bytes): - movl 23(%eax), %ecx - movl %ecx, 23(%edx) -L(bk_write_23bytes): - movl 19(%eax), %ecx - movl %ecx, 19(%edx) -L(bk_write_19bytes): - movl 15(%eax), %ecx - movl %ecx, 15(%edx) -L(bk_write_15bytes): - movl 11(%eax), %ecx - movl %ecx, 11(%edx) -L(bk_write_11bytes): - movl 7(%eax), %ecx - movl %ecx, 7(%edx) -L(bk_write_7bytes): - movl 3(%eax), %ecx - movl %ecx, 3(%edx) -L(bk_write_3bytes): - movzwl 1(%eax), %ecx - movw %cx, 1(%edx) - movzbl (%eax), %eax - movb %al, (%edx) -#ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -#endif - RETURN_END - - - .pushsection .rodata.ssse3,"a",@progbits - ALIGN (2) -L(table_48bytes_fwd): - .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) - .int JMPTBL 
(L(fwd_write_10bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) - .int 
JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) - - ALIGN (2) -L(shl_table): - .int JMPTBL (L(shl_0), L(shl_table)) - .int JMPTBL (L(shl_1), L(shl_table)) - .int JMPTBL (L(shl_2), L(shl_table)) - .int JMPTBL (L(shl_3), L(shl_table)) - .int JMPTBL (L(shl_4), L(shl_table)) - .int JMPTBL (L(shl_5), L(shl_table)) - .int JMPTBL (L(shl_6), L(shl_table)) - .int JMPTBL (L(shl_7), L(shl_table)) - .int JMPTBL (L(shl_8), L(shl_table)) - .int JMPTBL (L(shl_9), L(shl_table)) - .int JMPTBL (L(shl_10), L(shl_table)) - .int JMPTBL (L(shl_11), L(shl_table)) - .int JMPTBL (L(shl_12), L(shl_table)) - .int JMPTBL (L(shl_13), L(shl_table)) - .int JMPTBL (L(shl_14), L(shl_table)) - .int JMPTBL (L(shl_15), L(shl_table)) - - ALIGN (2) -L(table_48_bytes_bwd): - .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) - .int JMPTBL 
(L(bk_write_18bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) - - .popsection - -#ifdef USE_AS_MEMMOVE - ALIGN (4) -L(copy_backward): - PUSH (%esi) - movl %eax, %esi - add %ecx, %edx - add %ecx, %esi - testl $0x3, %edx - jnz L(bk_align) - -L(bk_aligned_4): - cmp $64, %ecx - jae L(bk_write_more64bytes) - 
-L(bk_write_64bytesless): - cmp $32, %ecx - jb L(bk_write_less32bytes) - -L(bk_write_more32bytes): - /* Copy 32 bytes at a time. */ - sub $32, %ecx - movl -4(%esi), %eax - movl %eax, -4(%edx) - movl -8(%esi), %eax - movl %eax, -8(%edx) - movl -12(%esi), %eax - movl %eax, -12(%edx) - movl -16(%esi), %eax - movl %eax, -16(%edx) - movl -20(%esi), %eax - movl %eax, -20(%edx) - movl -24(%esi), %eax - movl %eax, -24(%edx) - movl -28(%esi), %eax - movl %eax, -28(%edx) - movl -32(%esi), %eax - movl %eax, -32(%edx) - sub $32, %edx - sub $32, %esi - -L(bk_write_less32bytes): - movl %esi, %eax - sub %ecx, %edx - sub %ecx, %eax - POP (%esi) -L(bk_write_less48bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) - - CFI_PUSH (%esi) - ALIGN (4) -L(bk_align): - cmp $8, %ecx - jbe L(bk_write_less32bytes) - testl $1, %edx - /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, - then (EDX & 2) must be != 0. */ - jz L(bk_got2) - sub $1, %esi - sub $1, %ecx - sub $1, %edx - movzbl (%esi), %eax - movb %al, (%edx) - - testl $2, %edx - jz L(bk_aligned_4) - -L(bk_got2): - sub $2, %esi - sub $2, %ecx - sub $2, %edx - movzwl (%esi), %eax - movw %ax, (%edx) - jmp L(bk_aligned_4) - - ALIGN (4) -L(bk_write_more64bytes): - /* Check alignment of last byte. */ - testl $15, %edx - jz L(bk_ssse3_cpy_pre) - -/* EDX is aligned 4 bytes, but not 16 bytes. 
*/ -L(bk_ssse3_align): - sub $4, %esi - sub $4, %ecx - sub $4, %edx - movl (%esi), %eax - movl %eax, (%edx) - - testl $15, %edx - jz L(bk_ssse3_cpy_pre) - - sub $4, %esi - sub $4, %ecx - sub $4, %edx - movl (%esi), %eax - movl %eax, (%edx) - - testl $15, %edx - jz L(bk_ssse3_cpy_pre) - - sub $4, %esi - sub $4, %ecx - sub $4, %edx - movl (%esi), %eax - movl %eax, (%edx) - -L(bk_ssse3_cpy_pre): - cmp $64, %ecx - jb L(bk_write_more32bytes) - -L(bk_ssse3_cpy): - sub $64, %esi - sub $64, %ecx - sub $64, %edx - movdqu 0x30(%esi), %xmm3 - movdqa %xmm3, 0x30(%edx) - movdqu 0x20(%esi), %xmm2 - movdqa %xmm2, 0x20(%edx) - movdqu 0x10(%esi), %xmm1 - movdqa %xmm1, 0x10(%edx) - movdqu (%esi), %xmm0 - movdqa %xmm0, (%edx) - cmp $64, %ecx - jae L(bk_ssse3_cpy) - jmp L(bk_write_64bytesless) - -#endif - -END (MEMCPY) - -#endif diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S deleted file mode 100644 index 27ab6a2c3e..0000000000 --- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S +++ /dev/null @@ -1,3162 +0,0 @@ -/* memcpy with SSSE3 - Copyright (C) 2010-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . 
*/ - -#if IS_IN (libc) \ - && (defined SHARED \ - || defined USE_AS_MEMMOVE \ - || !defined USE_MULTIARCH) - -# include -# include "asm-syntax.h" - -# ifndef MEMCPY -# define MEMCPY __memcpy_ssse3 -# define MEMCPY_CHK __memcpy_chk_ssse3 -# endif - -# ifdef USE_AS_BCOPY -# define SRC PARMS -# define DEST SRC+4 -# define LEN DEST+4 -# else -# define DEST PARMS -# define SRC DEST+4 -# define LEN SRC+4 -# endif - -# define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ - cfi_rel_offset (REG, 0) - -# define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ - cfi_restore (REG) - -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) - -# ifdef SHARED -# define PARMS 8 /* Preserve EBX. */ -# define ENTRANCE PUSH (%ebx); -# define RETURN_END POP (%ebx); ret -# define RETURN RETURN_END; CFI_PUSH (%ebx) -# define JMPTBL(I, B) I - B - -/* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ - -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - /* We first load PC into EBX. */ \ - SETUP_PIC_REG(bx); \ - /* Get the address of the jump table. */ \ - addl $(TABLE - .), %ebx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - addl (%ebx, INDEX, SCALE), %ebx; \ - /* We loaded the jump table. Go. */ \ - jmp *%ebx -# else - -# define PARMS 4 -# define ENTRANCE -# define RETURN_END ret -# define RETURN RETURN_END -# define JMPTBL(I, B) I - -/* Branch to an entry in a jump table. TABLE is a jump table with - absolute offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. 
*/ - -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - jmp *TABLE(, INDEX, SCALE) -# endif - - .section .text.ssse3,"ax",@progbits -# if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -# endif -ENTRY (MEMCPY) - ENTRANCE - movl LEN(%esp), %ecx - movl SRC(%esp), %eax - movl DEST(%esp), %edx - -# ifdef USE_AS_MEMMOVE - cmp %eax, %edx - jb L(copy_forward) - je L(fwd_write_0bytes) - cmp $32, %ecx - jae L(memmove_bwd) - jmp L(bk_write_less32bytes_2) - - .p2align 4 -L(memmove_bwd): - add %ecx, %eax - cmp %eax, %edx - movl SRC(%esp), %eax - jb L(copy_backward) - -L(copy_forward): -# endif - cmp $48, %ecx - jae L(48bytesormore) - -L(fwd_write_less32bytes): -# ifndef USE_AS_MEMMOVE - cmp %dl, %al - jb L(bk_write) -# endif - add %ecx, %edx - add %ecx, %eax - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) -# ifndef USE_AS_MEMMOVE - .p2align 4 -L(bk_write): - BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) -# endif - - .p2align 4 -L(48bytesormore): -# ifndef USE_AS_MEMMOVE - movlpd (%eax), %xmm0 - movlpd 8(%eax), %xmm1 - movlpd %xmm0, (%edx) - movlpd %xmm1, 8(%edx) -# else - movdqu (%eax), %xmm0 -# endif - PUSH (%edi) - movl %edx, %edi - and $-16, %edx - add $16, %edx - sub %edx, %edi - add %edi, %ecx - sub %edi, %eax - -# ifdef SHARED_CACHE_SIZE_HALF - cmp $SHARED_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_shared_cache_size_half, %ecx -# endif -# endif - - mov %eax, %edi - jae L(large_page) - and $0xf, %edi - jz L(shl_0) - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) - - .p2align 4 -L(shl_0): -# ifdef USE_AS_MEMMOVE - movl DEST+4(%esp), %edi - movdqu %xmm0, (%edi) -# endif - xor %edi, %edi - cmp $127, %ecx - ja L(shl_0_gobble) - lea -32(%ecx), %ecx - - .p2align 4 -L(shl_0_loop): - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, 
%edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi - jb L(shl_0_end) - - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi - jb L(shl_0_end) - - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi - jb L(shl_0_end) - - movdqa (%eax, %edi), %xmm0 - movdqa 16(%eax, %edi), %xmm1 - sub $32, %ecx - movdqa %xmm0, (%edx, %edi) - movdqa %xmm1, 16(%edx, %edi) - lea 32(%edi), %edi - -L(shl_0_end): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - add %edi, %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_0_gobble): -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - POP (%edi) - lea -128(%ecx), %ecx - jae L(shl_0_gobble_mem_loop) - - .p2align 4 -L(shl_0_gobble_cache_loop): - movdqa (%eax), %xmm0 - movdqa 0x10(%eax), %xmm1 - movdqa 0x20(%eax), %xmm2 - movdqa 0x30(%eax), %xmm3 - movdqa 0x40(%eax), %xmm4 - movdqa 0x50(%eax), %xmm5 - movdqa 0x60(%eax), %xmm6 - movdqa 0x70(%eax), %xmm7 - lea 0x80(%eax), %eax - sub $128, %ecx - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - movdqa %xmm2, 0x20(%edx) - movdqa %xmm3, 0x30(%edx) - movdqa %xmm4, 0x40(%edx) - movdqa %xmm5, 0x50(%edx) - movdqa %xmm6, 0x60(%edx) - movdqa %xmm7, 0x70(%edx) - lea 0x80(%edx), %edx - - jae L(shl_0_gobble_cache_loop) - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(shl_0_cache_less_64bytes) - - movdqa (%eax), %xmm0 - sub $0x40, %ecx - movdqa 0x10(%eax), %xmm1 - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - movdqa 0x20(%eax), %xmm0 - movdqa 0x30(%eax), 
%xmm1 - add $0x40, %eax - movdqa %xmm0, 0x20(%edx) - movdqa %xmm1, 0x30(%edx) - add $0x40, %edx - -L(shl_0_cache_less_64bytes): - cmp $0x20, %ecx - jb L(shl_0_cache_less_32bytes) - movdqa (%eax), %xmm0 - sub $0x20, %ecx - movdqa 0x10(%eax), %xmm1 - add $0x20, %eax - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - add $0x20, %edx - -L(shl_0_cache_less_32bytes): - cmp $0x10, %ecx - jb L(shl_0_cache_less_16bytes) - sub $0x10, %ecx - movdqa (%eax), %xmm0 - add $0x10, %eax - movdqa %xmm0, (%edx) - add $0x10, %edx - -L(shl_0_cache_less_16bytes): - add %ecx, %edx - add %ecx, %eax - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - .p2align 4 -L(shl_0_gobble_mem_loop): - prefetcht0 0x1c0(%eax) - prefetcht0 0x280(%eax) - prefetcht0 0x1c0(%edx) - - movdqa (%eax), %xmm0 - movdqa 0x10(%eax), %xmm1 - movdqa 0x20(%eax), %xmm2 - movdqa 0x30(%eax), %xmm3 - movdqa 0x40(%eax), %xmm4 - movdqa 0x50(%eax), %xmm5 - movdqa 0x60(%eax), %xmm6 - movdqa 0x70(%eax), %xmm7 - lea 0x80(%eax), %eax - sub $0x80, %ecx - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - movdqa %xmm2, 0x20(%edx) - movdqa %xmm3, 0x30(%edx) - movdqa %xmm4, 0x40(%edx) - movdqa %xmm5, 0x50(%edx) - movdqa %xmm6, 0x60(%edx) - movdqa %xmm7, 0x70(%edx) - lea 0x80(%edx), %edx - - jae L(shl_0_gobble_mem_loop) - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(shl_0_mem_less_64bytes) - - movdqa (%eax), %xmm0 - sub $0x40, %ecx - movdqa 0x10(%eax), %xmm1 - - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - - movdqa 0x20(%eax), %xmm0 - movdqa 0x30(%eax), %xmm1 - add $0x40, %eax - - movdqa %xmm0, 0x20(%edx) - movdqa %xmm1, 0x30(%edx) - add $0x40, %edx - -L(shl_0_mem_less_64bytes): - cmp $0x20, %ecx - jb L(shl_0_mem_less_32bytes) - movdqa (%eax), %xmm0 - sub $0x20, %ecx - movdqa 0x10(%eax), %xmm1 - add $0x20, %eax - movdqa %xmm0, (%edx) - movdqa %xmm1, 0x10(%edx) - add $0x20, %edx - -L(shl_0_mem_less_32bytes): - cmp $0x10, %ecx - jb L(shl_0_mem_less_16bytes) - sub $0x10, %ecx - movdqa (%eax), %xmm0 - add $0x10, %eax - 
movdqa %xmm0, (%edx) - add $0x10, %edx - -L(shl_0_mem_less_16bytes): - add %ecx, %edx - add %ecx, %eax - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) - - .p2align 4 -L(shl_1): -# ifndef USE_AS_MEMMOVE - movaps -1(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -1(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_1_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl1LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 15(%eax), %xmm2 - movaps 31(%eax), %xmm3 - movaps 47(%eax), %xmm4 - movaps 63(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $1, %xmm4, %xmm5 - palignr $1, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $1, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $1, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl1LoopStart) - -L(Shl1LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 15(%eax), %xmm2 - movaps 31(%eax), %xmm3 - palignr $1, %xmm2, %xmm3 - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_1_no_prefetch): - lea -32(%ecx), %ecx - lea -1(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_1_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $1, %xmm2, %xmm3 - palignr $1, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jb L(sh_1_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), 
%xmm3 - movdqa %xmm3, %xmm1 - palignr $1, %xmm2, %xmm3 - palignr $1, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jae L(sh_1_no_prefetch_loop) - -L(sh_1_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 1(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_2): -# ifndef USE_AS_MEMMOVE - movaps -2(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -2(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_2_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl2LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 14(%eax), %xmm2 - movaps 30(%eax), %xmm3 - movaps 46(%eax), %xmm4 - movaps 62(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $2, %xmm4, %xmm5 - palignr $2, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $2, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $2, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl2LoopStart) - -L(Shl2LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 14(%eax), %xmm2 - movaps 30(%eax), %xmm3 - palignr $2, %xmm2, %xmm3 - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_2_no_prefetch): - lea -32(%ecx), %ecx - lea -2(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_2_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $2, 
%xmm2, %xmm3 - palignr $2, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jb L(sh_2_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $2, %xmm2, %xmm3 - palignr $2, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jae L(sh_2_no_prefetch_loop) - -L(sh_2_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 2(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_3): -# ifndef USE_AS_MEMMOVE - movaps -3(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -3(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_3_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl3LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 13(%eax), %xmm2 - movaps 29(%eax), %xmm3 - movaps 45(%eax), %xmm4 - movaps 61(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $3, %xmm4, %xmm5 - palignr $3, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $3, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $3, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl3LoopStart) - -L(Shl3LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 13(%eax), %xmm2 - movaps 29(%eax), %xmm3 - palignr $3, %xmm2, %xmm3 - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 
-L(sh_3_no_prefetch): - lea -32(%ecx), %ecx - lea -3(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_3_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $3, %xmm2, %xmm3 - palignr $3, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(sh_3_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $3, %xmm2, %xmm3 - palignr $3, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(sh_3_no_prefetch_loop) - -L(sh_3_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 3(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_4): -# ifndef USE_AS_MEMMOVE - movaps -4(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -4(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_4_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl4LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 12(%eax), %xmm2 - movaps 28(%eax), %xmm3 - movaps 44(%eax), %xmm4 - movaps 60(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $4, %xmm4, %xmm5 - palignr $4, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $4, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $4, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl4LoopStart) - -L(Shl4LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 12(%eax), %xmm2 - movaps 28(%eax), %xmm3 - palignr $4, %xmm2, 
%xmm3 - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_4_no_prefetch): - lea -32(%ecx), %ecx - lea -4(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_4_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $4, %xmm2, %xmm3 - palignr $4, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(sh_4_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $4, %xmm2, %xmm3 - palignr $4, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(sh_4_no_prefetch_loop) - -L(sh_4_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 4(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_5): -# ifndef USE_AS_MEMMOVE - movaps -5(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -5(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_5_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl5LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 11(%eax), %xmm2 - movaps 27(%eax), %xmm3 - movaps 43(%eax), %xmm4 - movaps 59(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $5, %xmm4, %xmm5 - palignr $5, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $5, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $5, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 
16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl5LoopStart) - -L(Shl5LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 11(%eax), %xmm2 - movaps 27(%eax), %xmm3 - palignr $5, %xmm2, %xmm3 - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_5_no_prefetch): - lea -32(%ecx), %ecx - lea -5(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_5_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $5, %xmm2, %xmm3 - palignr $5, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(sh_5_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $5, %xmm2, %xmm3 - palignr $5, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(sh_5_no_prefetch_loop) - -L(sh_5_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 5(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_6): -# ifndef USE_AS_MEMMOVE - movaps -6(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -6(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_6_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl6LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 10(%eax), %xmm2 - movaps 26(%eax), %xmm3 - movaps 42(%eax), %xmm4 - 
movaps 58(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $6, %xmm4, %xmm5 - palignr $6, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $6, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $6, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl6LoopStart) - -L(Shl6LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 10(%eax), %xmm2 - movaps 26(%eax), %xmm3 - palignr $6, %xmm2, %xmm3 - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_6_no_prefetch): - lea -32(%ecx), %ecx - lea -6(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_6_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $6, %xmm2, %xmm3 - palignr $6, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jb L(sh_6_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $6, %xmm2, %xmm3 - palignr $6, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - - jae L(sh_6_no_prefetch_loop) - -L(sh_6_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 6(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_7): -# ifndef USE_AS_MEMMOVE - movaps -7(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -7(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp 
__x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_7_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl7LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 9(%eax), %xmm2 - movaps 25(%eax), %xmm3 - movaps 41(%eax), %xmm4 - movaps 57(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $7, %xmm4, %xmm5 - palignr $7, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $7, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $7, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl7LoopStart) - -L(Shl7LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 9(%eax), %xmm2 - movaps 25(%eax), %xmm3 - palignr $7, %xmm2, %xmm3 - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_7_no_prefetch): - lea -32(%ecx), %ecx - lea -7(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_7_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $7, %xmm2, %xmm3 - palignr $7, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jb L(sh_7_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $7, %xmm2, %xmm3 - palignr $7, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jae L(sh_7_no_prefetch_loop) - -L(sh_7_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 7(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_8): -# ifndef USE_AS_MEMMOVE - movaps -8(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -8(%eax), %xmm1 - movdqu 
%xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_8_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl8LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 8(%eax), %xmm2 - movaps 24(%eax), %xmm3 - movaps 40(%eax), %xmm4 - movaps 56(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $8, %xmm4, %xmm5 - palignr $8, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $8, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $8, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl8LoopStart) - -L(LoopLeave8): - add $32, %ecx - jle L(shl_end_0) - - movaps 8(%eax), %xmm2 - movaps 24(%eax), %xmm3 - palignr $8, %xmm2, %xmm3 - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_8_no_prefetch): - lea -32(%ecx), %ecx - lea -8(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_8_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $8, %xmm2, %xmm3 - palignr $8, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jb L(sh_8_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $8, %xmm2, %xmm3 - palignr $8, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jae L(sh_8_no_prefetch_loop) - -L(sh_8_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 8(%edi, %eax), %eax - POP 
(%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_9): -# ifndef USE_AS_MEMMOVE - movaps -9(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -9(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_9_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl9LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 7(%eax), %xmm2 - movaps 23(%eax), %xmm3 - movaps 39(%eax), %xmm4 - movaps 55(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $9, %xmm4, %xmm5 - palignr $9, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $9, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $9, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl9LoopStart) - -L(Shl9LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 7(%eax), %xmm2 - movaps 23(%eax), %xmm3 - palignr $9, %xmm2, %xmm3 - palignr $9, %xmm1, %xmm2 - - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_9_no_prefetch): - lea -32(%ecx), %ecx - lea -9(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_9_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $9, %xmm2, %xmm3 - palignr $9, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jb L(sh_9_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $9, %xmm2, %xmm3 - palignr $9, %xmm4, %xmm2 - lea 
32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jae L(sh_9_no_prefetch_loop) - -L(sh_9_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 9(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_10): -# ifndef USE_AS_MEMMOVE - movaps -10(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -10(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_10_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl10LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 6(%eax), %xmm2 - movaps 22(%eax), %xmm3 - movaps 38(%eax), %xmm4 - movaps 54(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $10, %xmm4, %xmm5 - palignr $10, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $10, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $10, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl10LoopStart) - -L(Shl10LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 6(%eax), %xmm2 - movaps 22(%eax), %xmm3 - palignr $10, %xmm2, %xmm3 - palignr $10, %xmm1, %xmm2 - - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_10_no_prefetch): - lea -32(%ecx), %ecx - lea -10(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_10_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $10, %xmm2, %xmm3 - palignr $10, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa 
%xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jb L(sh_10_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $10, %xmm2, %xmm3 - palignr $10, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jae L(sh_10_no_prefetch_loop) - -L(sh_10_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 10(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_11): -# ifndef USE_AS_MEMMOVE - movaps -11(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -11(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_11_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl11LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 5(%eax), %xmm2 - movaps 21(%eax), %xmm3 - movaps 37(%eax), %xmm4 - movaps 53(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $11, %xmm4, %xmm5 - palignr $11, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $11, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $11, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl11LoopStart) - -L(Shl11LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 5(%eax), %xmm2 - movaps 21(%eax), %xmm3 - palignr $11, %xmm2, %xmm3 - palignr $11, %xmm1, %xmm2 - - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_11_no_prefetch): - lea -32(%ecx), %ecx - lea 
-11(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_11_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $11, %xmm2, %xmm3 - palignr $11, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jb L(sh_11_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $11, %xmm2, %xmm3 - palignr $11, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jae L(sh_11_no_prefetch_loop) - -L(sh_11_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 11(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_12): -# ifndef USE_AS_MEMMOVE - movaps -12(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -12(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_12_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl12LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 4(%eax), %xmm2 - movaps 20(%eax), %xmm3 - movaps 36(%eax), %xmm4 - movaps 52(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $12, %xmm4, %xmm5 - palignr $12, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $12, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $12, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl12LoopStart) - -L(Shl12LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 4(%eax), %xmm2 - movaps 20(%eax), %xmm3 - palignr $12, %xmm2, %xmm3 - palignr $12, %xmm1, %xmm2 - - 
movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_12_no_prefetch): - lea -32(%ecx), %ecx - lea -12(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_12_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $12, %xmm2, %xmm3 - palignr $12, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jb L(sh_12_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $12, %xmm2, %xmm3 - palignr $12, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jae L(sh_12_no_prefetch_loop) - -L(sh_12_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 12(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_13): -# ifndef USE_AS_MEMMOVE - movaps -13(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -13(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_13_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl13LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 3(%eax), %xmm2 - movaps 19(%eax), %xmm3 - movaps 35(%eax), %xmm4 - movaps 51(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $13, %xmm4, %xmm5 - palignr $13, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $13, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $13, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, 
%xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl13LoopStart) - -L(Shl13LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 3(%eax), %xmm2 - movaps 19(%eax), %xmm3 - palignr $13, %xmm2, %xmm3 - palignr $13, %xmm1, %xmm2 - - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_13_no_prefetch): - lea -32(%ecx), %ecx - lea -13(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_13_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $13, %xmm2, %xmm3 - palignr $13, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jb L(sh_13_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $13, %xmm2, %xmm3 - palignr $13, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jae L(sh_13_no_prefetch_loop) - -L(sh_13_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 13(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_14): -# ifndef USE_AS_MEMMOVE - movaps -14(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -14(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp __x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_14_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl14LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 2(%eax), %xmm2 - movaps 18(%eax), %xmm3 - movaps 34(%eax), %xmm4 - movaps 
50(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $14, %xmm4, %xmm5 - palignr $14, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $14, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $14, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl14LoopStart) - -L(Shl14LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 2(%eax), %xmm2 - movaps 18(%eax), %xmm3 - palignr $14, %xmm2, %xmm3 - palignr $14, %xmm1, %xmm2 - - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_14_no_prefetch): - lea -32(%ecx), %ecx - lea -14(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_14_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $14, %xmm2, %xmm3 - palignr $14, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jb L(sh_14_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $14, %xmm2, %xmm3 - palignr $14, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jae L(sh_14_no_prefetch_loop) - -L(sh_14_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 14(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_15): -# ifndef USE_AS_MEMMOVE - movaps -15(%eax), %xmm1 -# else - movl DEST+4(%esp), %edi - movaps -15(%eax), %xmm1 - movdqu %xmm0, (%edi) -# endif -# ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %ecx -# else -# ifdef SHARED - SETUP_PIC_REG(bx) - add $_GLOBAL_OFFSET_TABLE_, %ebx - cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else - cmp 
__x86_data_cache_size_half, %ecx -# endif -# endif - jb L(sh_15_no_prefetch) - - lea -64(%ecx), %ecx - - .p2align 4 -L(Shl15LoopStart): - prefetcht0 0x1c0(%eax) - prefetcht0 0x1c0(%edx) - movaps 1(%eax), %xmm2 - movaps 17(%eax), %xmm3 - movaps 33(%eax), %xmm4 - movaps 49(%eax), %xmm5 - movaps %xmm5, %xmm7 - palignr $15, %xmm4, %xmm5 - palignr $15, %xmm3, %xmm4 - movaps %xmm5, 48(%edx) - palignr $15, %xmm2, %xmm3 - lea 64(%eax), %eax - palignr $15, %xmm1, %xmm2 - movaps %xmm4, 32(%edx) - movaps %xmm3, 16(%edx) - movaps %xmm7, %xmm1 - movaps %xmm2, (%edx) - lea 64(%edx), %edx - sub $64, %ecx - ja L(Shl15LoopStart) - -L(Shl15LoopLeave): - add $32, %ecx - jle L(shl_end_0) - - movaps 1(%eax), %xmm2 - movaps 17(%eax), %xmm3 - palignr $15, %xmm2, %xmm3 - palignr $15, %xmm1, %xmm2 - - movaps %xmm2, (%edx) - movaps %xmm3, 16(%edx) - lea 32(%edx, %ecx), %edx - lea 32(%eax, %ecx), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(sh_15_no_prefetch): - lea -32(%ecx), %ecx - lea -15(%eax), %eax - xor %edi, %edi - - .p2align 4 -L(sh_15_no_prefetch_loop): - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm4 - palignr $15, %xmm2, %xmm3 - palignr $15, %xmm1, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jb L(sh_15_end_no_prefetch_loop) - - movdqa 16(%eax, %edi), %xmm2 - sub $32, %ecx - movdqa 32(%eax, %edi), %xmm3 - movdqa %xmm3, %xmm1 - palignr $15, %xmm2, %xmm3 - palignr $15, %xmm4, %xmm2 - lea 32(%edi), %edi - movdqa %xmm2, -32(%edx, %edi) - movdqa %xmm3, -16(%edx, %edi) - jae L(sh_15_no_prefetch_loop) - -L(sh_15_end_no_prefetch_loop): - lea 32(%ecx), %ecx - add %ecx, %edi - add %edi, %edx - lea 15(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(shl_end_0): - lea 32(%ecx), %ecx - lea (%edx, %ecx), %edx - lea (%eax, %ecx), %eax - POP (%edi) - 
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - - .p2align 4 -L(fwd_write_44bytes): - movq -44(%eax), %xmm0 - movq %xmm0, -44(%edx) -L(fwd_write_36bytes): - movq -36(%eax), %xmm0 - movq %xmm0, -36(%edx) -L(fwd_write_28bytes): - movq -28(%eax), %xmm0 - movq %xmm0, -28(%edx) -L(fwd_write_20bytes): - movq -20(%eax), %xmm0 - movq %xmm0, -20(%edx) -L(fwd_write_12bytes): - movq -12(%eax), %xmm0 - movq %xmm0, -12(%edx) -L(fwd_write_4bytes): - movl -4(%eax), %ecx - movl %ecx, -4(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_40bytes): - movq -40(%eax), %xmm0 - movq %xmm0, -40(%edx) -L(fwd_write_32bytes): - movq -32(%eax), %xmm0 - movq %xmm0, -32(%edx) -L(fwd_write_24bytes): - movq -24(%eax), %xmm0 - movq %xmm0, -24(%edx) -L(fwd_write_16bytes): - movq -16(%eax), %xmm0 - movq %xmm0, -16(%edx) -L(fwd_write_8bytes): - movq -8(%eax), %xmm0 - movq %xmm0, -8(%edx) -L(fwd_write_0bytes): -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_5bytes): - movl -5(%eax), %ecx - movl -4(%eax), %eax - movl %ecx, -5(%edx) - movl %eax, -4(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_45bytes): - movq -45(%eax), %xmm0 - movq %xmm0, -45(%edx) -L(fwd_write_37bytes): - movq -37(%eax), %xmm0 - movq %xmm0, -37(%edx) -L(fwd_write_29bytes): - movq -29(%eax), %xmm0 - movq %xmm0, -29(%edx) -L(fwd_write_21bytes): - movq -21(%eax), %xmm0 - movq %xmm0, -21(%edx) -L(fwd_write_13bytes): - movq -13(%eax), %xmm0 - movq %xmm0, -13(%edx) - movl -5(%eax), %ecx - movl %ecx, -5(%edx) - movzbl -1(%eax), %ecx - movb %cl, -1(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_41bytes): - 
movq -41(%eax), %xmm0 - movq %xmm0, -41(%edx) -L(fwd_write_33bytes): - movq -33(%eax), %xmm0 - movq %xmm0, -33(%edx) -L(fwd_write_25bytes): - movq -25(%eax), %xmm0 - movq %xmm0, -25(%edx) -L(fwd_write_17bytes): - movq -17(%eax), %xmm0 - movq %xmm0, -17(%edx) -L(fwd_write_9bytes): - movq -9(%eax), %xmm0 - movq %xmm0, -9(%edx) -L(fwd_write_1bytes): - movzbl -1(%eax), %ecx - movb %cl, -1(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_46bytes): - movq -46(%eax), %xmm0 - movq %xmm0, -46(%edx) -L(fwd_write_38bytes): - movq -38(%eax), %xmm0 - movq %xmm0, -38(%edx) -L(fwd_write_30bytes): - movq -30(%eax), %xmm0 - movq %xmm0, -30(%edx) -L(fwd_write_22bytes): - movq -22(%eax), %xmm0 - movq %xmm0, -22(%edx) -L(fwd_write_14bytes): - movq -14(%eax), %xmm0 - movq %xmm0, -14(%edx) -L(fwd_write_6bytes): - movl -6(%eax), %ecx - movl %ecx, -6(%edx) - movzwl -2(%eax), %ecx - movw %cx, -2(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_42bytes): - movq -42(%eax), %xmm0 - movq %xmm0, -42(%edx) -L(fwd_write_34bytes): - movq -34(%eax), %xmm0 - movq %xmm0, -34(%edx) -L(fwd_write_26bytes): - movq -26(%eax), %xmm0 - movq %xmm0, -26(%edx) -L(fwd_write_18bytes): - movq -18(%eax), %xmm0 - movq %xmm0, -18(%edx) -L(fwd_write_10bytes): - movq -10(%eax), %xmm0 - movq %xmm0, -10(%edx) -L(fwd_write_2bytes): - movzwl -2(%eax), %ecx - movw %cx, -2(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_47bytes): - movq -47(%eax), %xmm0 - movq %xmm0, -47(%edx) -L(fwd_write_39bytes): - movq -39(%eax), %xmm0 - movq %xmm0, -39(%edx) -L(fwd_write_31bytes): - movq -31(%eax), %xmm0 - movq %xmm0, -31(%edx) -L(fwd_write_23bytes): - movq -23(%eax), %xmm0 - movq %xmm0, -23(%edx) 
-L(fwd_write_15bytes): - movq -15(%eax), %xmm0 - movq %xmm0, -15(%edx) -L(fwd_write_7bytes): - movl -7(%eax), %ecx - movl %ecx, -7(%edx) - movzwl -3(%eax), %ecx - movzbl -1(%eax), %eax - movw %cx, -3(%edx) - movb %al, -1(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_43bytes): - movq -43(%eax), %xmm0 - movq %xmm0, -43(%edx) -L(fwd_write_35bytes): - movq -35(%eax), %xmm0 - movq %xmm0, -35(%edx) -L(fwd_write_27bytes): - movq -27(%eax), %xmm0 - movq %xmm0, -27(%edx) -L(fwd_write_19bytes): - movq -19(%eax), %xmm0 - movq %xmm0, -19(%edx) -L(fwd_write_11bytes): - movq -11(%eax), %xmm0 - movq %xmm0, -11(%edx) -L(fwd_write_3bytes): - movzwl -3(%eax), %ecx - movzbl -1(%eax), %eax - movw %cx, -3(%edx) - movb %al, -1(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_40bytes_align): - movdqa -40(%eax), %xmm0 - movdqa %xmm0, -40(%edx) -L(fwd_write_24bytes_align): - movdqa -24(%eax), %xmm0 - movdqa %xmm0, -24(%edx) -L(fwd_write_8bytes_align): - movq -8(%eax), %xmm0 - movq %xmm0, -8(%edx) -L(fwd_write_0bytes_align): -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_32bytes_align): - movdqa -32(%eax), %xmm0 - movdqa %xmm0, -32(%edx) -L(fwd_write_16bytes_align): - movdqa -16(%eax), %xmm0 - movdqa %xmm0, -16(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_5bytes_align): - movl -5(%eax), %ecx - movl -4(%eax), %eax - movl %ecx, -5(%edx) - movl %eax, -4(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_45bytes_align): - movdqa -45(%eax), %xmm0 - movdqa 
%xmm0, -45(%edx) -L(fwd_write_29bytes_align): - movdqa -29(%eax), %xmm0 - movdqa %xmm0, -29(%edx) -L(fwd_write_13bytes_align): - movq -13(%eax), %xmm0 - movq %xmm0, -13(%edx) - movl -5(%eax), %ecx - movl %ecx, -5(%edx) - movzbl -1(%eax), %ecx - movb %cl, -1(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_37bytes_align): - movdqa -37(%eax), %xmm0 - movdqa %xmm0, -37(%edx) -L(fwd_write_21bytes_align): - movdqa -21(%eax), %xmm0 - movdqa %xmm0, -21(%edx) - movl -5(%eax), %ecx - movl %ecx, -5(%edx) - movzbl -1(%eax), %ecx - movb %cl, -1(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_41bytes_align): - movdqa -41(%eax), %xmm0 - movdqa %xmm0, -41(%edx) -L(fwd_write_25bytes_align): - movdqa -25(%eax), %xmm0 - movdqa %xmm0, -25(%edx) -L(fwd_write_9bytes_align): - movq -9(%eax), %xmm0 - movq %xmm0, -9(%edx) -L(fwd_write_1bytes_align): - movzbl -1(%eax), %ecx - movb %cl, -1(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_33bytes_align): - movdqa -33(%eax), %xmm0 - movdqa %xmm0, -33(%edx) -L(fwd_write_17bytes_align): - movdqa -17(%eax), %xmm0 - movdqa %xmm0, -17(%edx) - movzbl -1(%eax), %ecx - movb %cl, -1(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_46bytes_align): - movdqa -46(%eax), %xmm0 - movdqa %xmm0, -46(%edx) -L(fwd_write_30bytes_align): - movdqa -30(%eax), %xmm0 - movdqa %xmm0, -30(%edx) -L(fwd_write_14bytes_align): - movq -14(%eax), %xmm0 - movq %xmm0, -14(%edx) -L(fwd_write_6bytes_align): - movl -6(%eax), %ecx - movl %ecx, -6(%edx) - movzwl -2(%eax), %ecx - movw %cx, -2(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl 
%edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_38bytes_align): - movdqa -38(%eax), %xmm0 - movdqa %xmm0, -38(%edx) -L(fwd_write_22bytes_align): - movdqa -22(%eax), %xmm0 - movdqa %xmm0, -22(%edx) - movl -6(%eax), %ecx - movl %ecx, -6(%edx) - movzwl -2(%eax), %ecx - movw %cx, -2(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_42bytes_align): - movdqa -42(%eax), %xmm0 - movdqa %xmm0, -42(%edx) -L(fwd_write_26bytes_align): - movdqa -26(%eax), %xmm0 - movdqa %xmm0, -26(%edx) -L(fwd_write_10bytes_align): - movq -10(%eax), %xmm0 - movq %xmm0, -10(%edx) -L(fwd_write_2bytes_align): - movzwl -2(%eax), %ecx - movw %cx, -2(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_34bytes_align): - movdqa -34(%eax), %xmm0 - movdqa %xmm0, -34(%edx) -L(fwd_write_18bytes_align): - movdqa -18(%eax), %xmm0 - movdqa %xmm0, -18(%edx) - movzwl -2(%eax), %ecx - movw %cx, -2(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_47bytes_align): - movdqa -47(%eax), %xmm0 - movdqa %xmm0, -47(%edx) -L(fwd_write_31bytes_align): - movdqa -31(%eax), %xmm0 - movdqa %xmm0, -31(%edx) -L(fwd_write_15bytes_align): - movq -15(%eax), %xmm0 - movq %xmm0, -15(%edx) -L(fwd_write_7bytes_align): - movl -7(%eax), %ecx - movl %ecx, -7(%edx) - movzwl -3(%eax), %ecx - movzbl -1(%eax), %eax - movw %cx, -3(%edx) - movb %al, -1(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_39bytes_align): - movdqa -39(%eax), %xmm0 - movdqa %xmm0, -39(%edx) -L(fwd_write_23bytes_align): - movdqa -23(%eax), %xmm0 - movdqa %xmm0, -23(%edx) - movl -7(%eax), %ecx - 
movl %ecx, -7(%edx) - movzwl -3(%eax), %ecx - movzbl -1(%eax), %eax - movw %cx, -3(%edx) - movb %al, -1(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_43bytes_align): - movdqa -43(%eax), %xmm0 - movdqa %xmm0, -43(%edx) -L(fwd_write_27bytes_align): - movdqa -27(%eax), %xmm0 - movdqa %xmm0, -27(%edx) -L(fwd_write_11bytes_align): - movq -11(%eax), %xmm0 - movq %xmm0, -11(%edx) -L(fwd_write_3bytes_align): - movzwl -3(%eax), %ecx - movzbl -1(%eax), %eax - movw %cx, -3(%edx) - movb %al, -1(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_35bytes_align): - movdqa -35(%eax), %xmm0 - movdqa %xmm0, -35(%edx) -L(fwd_write_19bytes_align): - movdqa -19(%eax), %xmm0 - movdqa %xmm0, -19(%edx) - movzwl -3(%eax), %ecx - movzbl -1(%eax), %eax - movw %cx, -3(%edx) - movb %al, -1(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_44bytes_align): - movdqa -44(%eax), %xmm0 - movdqa %xmm0, -44(%edx) -L(fwd_write_28bytes_align): - movdqa -28(%eax), %xmm0 - movdqa %xmm0, -28(%edx) -L(fwd_write_12bytes_align): - movq -12(%eax), %xmm0 - movq %xmm0, -12(%edx) -L(fwd_write_4bytes_align): - movl -4(%eax), %ecx - movl %ecx, -4(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN - - .p2align 4 -L(fwd_write_36bytes_align): - movdqa -36(%eax), %xmm0 - movdqa %xmm0, -36(%edx) -L(fwd_write_20bytes_align): - movdqa -20(%eax), %xmm0 - movdqa %xmm0, -20(%edx) - movl -4(%eax), %ecx - movl %ecx, -4(%edx) -# ifndef USE_AS_BCOPY -# ifdef USE_AS_MEMPCPY - movl %edx, %eax -# else - movl DEST(%esp), %eax -# endif -# endif - RETURN_END - - CFI_PUSH (%edi) - - .p2align 4 -L(large_page): - movdqu (%eax), %xmm1 -# 
ifdef USE_AS_MEMMOVE - movl DEST+4(%esp), %edi - movdqu %xmm0, (%edi) -# endif - lea 16(%eax), %eax - movntdq %xmm1, (%edx) - lea 16(%edx), %edx - lea -0x90(%ecx), %ecx - POP (%edi) - - .p2align 4 -L(large_page_loop): - movdqu (%eax), %xmm0 - movdqu 0x10(%eax), %xmm1 - movdqu 0x20(%eax), %xmm2 - movdqu 0x30(%eax), %xmm3 - movdqu 0x40(%eax), %xmm4 - movdqu 0x50(%eax), %xmm5 - movdqu 0x60(%eax), %xmm6 - movdqu 0x70(%eax), %xmm7 - lea 0x80(%eax), %eax - - sub $0x80, %ecx - movntdq %xmm0, (%edx) - movntdq %xmm1, 0x10(%edx) - movntdq %xmm2, 0x20(%edx) - movntdq %xmm3, 0x30(%edx) - movntdq %xmm4, 0x40(%edx) - movntdq %xmm5, 0x50(%edx) - movntdq %xmm6, 0x60(%edx) - movntdq %xmm7, 0x70(%edx) - lea 0x80(%edx), %edx - jae L(large_page_loop) - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(large_page_less_64bytes) - - movdqu (%eax), %xmm0 - movdqu 0x10(%eax), %xmm1 - movdqu 0x20(%eax), %xmm2 - movdqu 0x30(%eax), %xmm3 - lea 0x40(%eax), %eax - - movntdq %xmm0, (%edx) - movntdq %xmm1, 0x10(%edx) - movntdq %xmm2, 0x20(%edx) - movntdq %xmm3, 0x30(%edx) - lea 0x40(%edx), %edx - sub $0x40, %ecx -L(large_page_less_64bytes): - cmp $32, %ecx - jb L(large_page_less_32bytes) - movdqu (%eax), %xmm0 - movdqu 0x10(%eax), %xmm1 - lea 0x20(%eax), %eax - movntdq %xmm0, (%edx) - movntdq %xmm1, 0x10(%edx) - lea 0x20(%edx), %edx - sub $0x20, %ecx -L(large_page_less_32bytes): - add %ecx, %edx - add %ecx, %eax - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - .p2align 4 -L(bk_write_44bytes): - movq 36(%eax), %xmm0 - movq %xmm0, 36(%edx) -L(bk_write_36bytes): - movq 28(%eax), %xmm0 - movq %xmm0, 28(%edx) -L(bk_write_28bytes): - movq 20(%eax), %xmm0 - movq %xmm0, 20(%edx) -L(bk_write_20bytes): - movq 12(%eax), %xmm0 - movq %xmm0, 12(%edx) -L(bk_write_12bytes): - movq 4(%eax), %xmm0 - movq %xmm0, 4(%edx) -L(bk_write_4bytes): - movl (%eax), %ecx - movl %ecx, (%edx) -L(bk_write_0bytes): -# ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - 
add %ecx, %eax -# endif -# endif - RETURN - - .p2align 4 -L(bk_write_40bytes): - movq 32(%eax), %xmm0 - movq %xmm0, 32(%edx) -L(bk_write_32bytes): - movq 24(%eax), %xmm0 - movq %xmm0, 24(%edx) -L(bk_write_24bytes): - movq 16(%eax), %xmm0 - movq %xmm0, 16(%edx) -L(bk_write_16bytes): - movq 8(%eax), %xmm0 - movq %xmm0, 8(%edx) -L(bk_write_8bytes): - movq (%eax), %xmm0 - movq %xmm0, (%edx) -# ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -# endif - RETURN - - .p2align 4 -L(bk_write_45bytes): - movq 37(%eax), %xmm0 - movq %xmm0, 37(%edx) -L(bk_write_37bytes): - movq 29(%eax), %xmm0 - movq %xmm0, 29(%edx) -L(bk_write_29bytes): - movq 21(%eax), %xmm0 - movq %xmm0, 21(%edx) -L(bk_write_21bytes): - movq 13(%eax), %xmm0 - movq %xmm0, 13(%edx) -L(bk_write_13bytes): - movq 5(%eax), %xmm0 - movq %xmm0, 5(%edx) -L(bk_write_5bytes): - movl 1(%eax), %ecx - movl %ecx, 1(%edx) -L(bk_write_1bytes): - movzbl (%eax), %ecx - movb %cl, (%edx) -# ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -# endif - RETURN - - .p2align 4 -L(bk_write_41bytes): - movq 33(%eax), %xmm0 - movq %xmm0, 33(%edx) -L(bk_write_33bytes): - movq 25(%eax), %xmm0 - movq %xmm0, 25(%edx) -L(bk_write_25bytes): - movq 17(%eax), %xmm0 - movq %xmm0, 17(%edx) -L(bk_write_17bytes): - movq 9(%eax), %xmm0 - movq %xmm0, 9(%edx) -L(bk_write_9bytes): - movq 1(%eax), %xmm0 - movq %xmm0, 1(%edx) - movzbl (%eax), %ecx - movb %cl, (%edx) -# ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -# endif - RETURN - - .p2align 4 -L(bk_write_46bytes): - movq 38(%eax), %xmm0 - movq %xmm0, 38(%edx) -L(bk_write_38bytes): - movq 30(%eax), %xmm0 - movq %xmm0, 30(%edx) -L(bk_write_30bytes): - movq 22(%eax), %xmm0 - movq %xmm0, 22(%edx) -L(bk_write_22bytes): - movq 14(%eax), %xmm0 - movq %xmm0, 14(%edx) -L(bk_write_14bytes): - movq 6(%eax), 
%xmm0 - movq %xmm0, 6(%edx) -L(bk_write_6bytes): - movl 2(%eax), %ecx - movl %ecx, 2(%edx) - movzwl (%eax), %ecx - movw %cx, (%edx) -# ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -# endif - RETURN - - .p2align 4 -L(bk_write_42bytes): - movq 34(%eax), %xmm0 - movq %xmm0, 34(%edx) -L(bk_write_34bytes): - movq 26(%eax), %xmm0 - movq %xmm0, 26(%edx) -L(bk_write_26bytes): - movq 18(%eax), %xmm0 - movq %xmm0, 18(%edx) -L(bk_write_18bytes): - movq 10(%eax), %xmm0 - movq %xmm0, 10(%edx) -L(bk_write_10bytes): - movq 2(%eax), %xmm0 - movq %xmm0, 2(%edx) -L(bk_write_2bytes): - movzwl (%eax), %ecx - movw %cx, (%edx) -# ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -# endif - RETURN - - .p2align 4 -L(bk_write_47bytes): - movq 39(%eax), %xmm0 - movq %xmm0, 39(%edx) -L(bk_write_39bytes): - movq 31(%eax), %xmm0 - movq %xmm0, 31(%edx) -L(bk_write_31bytes): - movq 23(%eax), %xmm0 - movq %xmm0, 23(%edx) -L(bk_write_23bytes): - movq 15(%eax), %xmm0 - movq %xmm0, 15(%edx) -L(bk_write_15bytes): - movq 7(%eax), %xmm0 - movq %xmm0, 7(%edx) -L(bk_write_7bytes): - movl 3(%eax), %ecx - movl %ecx, 3(%edx) - movzwl 1(%eax), %ecx - movw %cx, 1(%edx) - movzbl (%eax), %eax - movb %al, (%edx) -# ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl LEN(%esp), %ecx - add %ecx, %eax -# endif -# endif - RETURN - - .p2align 4 -L(bk_write_43bytes): - movq 35(%eax), %xmm0 - movq %xmm0, 35(%edx) -L(bk_write_35bytes): - movq 27(%eax), %xmm0 - movq %xmm0, 27(%edx) -L(bk_write_27bytes): - movq 19(%eax), %xmm0 - movq %xmm0, 19(%edx) -L(bk_write_19bytes): - movq 11(%eax), %xmm0 - movq %xmm0, 11(%edx) -L(bk_write_11bytes): - movq 3(%eax), %xmm0 - movq %xmm0, 3(%edx) -L(bk_write_3bytes): - movzwl 1(%eax), %ecx - movw %cx, 1(%edx) - movzbl (%eax), %eax - movb %al, (%edx) -# ifndef USE_AS_BCOPY - movl DEST(%esp), %eax -# ifdef USE_AS_MEMPCPY - movl 
LEN(%esp), %ecx - add %ecx, %eax -# endif -# endif - RETURN_END - - - .pushsection .rodata.ssse3,"a",@progbits - .p2align 2 -L(table_48bytes_fwd): - .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) - .int JMPTBL 
(L(fwd_write_31bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) - .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) - - .p2align 2 -L(table_48bytes_fwd_align): - .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_13bytes_align), 
L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_41bytes_align), 
L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align)) - .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align)) - - .p2align 2 -L(shl_table): - .int JMPTBL (L(shl_0), L(shl_table)) - .int JMPTBL (L(shl_1), L(shl_table)) - .int JMPTBL (L(shl_2), L(shl_table)) - .int JMPTBL (L(shl_3), L(shl_table)) - .int JMPTBL (L(shl_4), L(shl_table)) - .int JMPTBL (L(shl_5), L(shl_table)) - .int JMPTBL (L(shl_6), L(shl_table)) - .int JMPTBL (L(shl_7), L(shl_table)) - .int JMPTBL (L(shl_8), L(shl_table)) - .int JMPTBL (L(shl_9), L(shl_table)) - .int JMPTBL (L(shl_10), L(shl_table)) - .int JMPTBL (L(shl_11), L(shl_table)) - .int JMPTBL (L(shl_12), L(shl_table)) - .int JMPTBL (L(shl_13), L(shl_table)) - .int JMPTBL (L(shl_14), L(shl_table)) - .int JMPTBL (L(shl_15), L(shl_table)) - - .p2align 2 -L(table_48_bytes_bwd): - .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) - .int JMPTBL 
(L(bk_write_14bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) - .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) - - 
.popsection - -# ifdef USE_AS_MEMMOVE - .p2align 4 -L(copy_backward): - PUSH (%edi) - movl %eax, %edi - lea (%ecx,%edx,1),%edx - lea (%ecx,%edi,1),%edi - testl $0x3, %edx - jnz L(bk_align) - -L(bk_aligned_4): - cmp $64, %ecx - jae L(bk_write_more64bytes) - -L(bk_write_64bytesless): - cmp $32, %ecx - jb L(bk_write_less32bytes) - -L(bk_write_more32bytes): - /* Copy 32 bytes at a time. */ - sub $32, %ecx - movq -8(%edi), %xmm0 - movq %xmm0, -8(%edx) - movq -16(%edi), %xmm0 - movq %xmm0, -16(%edx) - movq -24(%edi), %xmm0 - movq %xmm0, -24(%edx) - movq -32(%edi), %xmm0 - movq %xmm0, -32(%edx) - sub $32, %edx - sub $32, %edi - -L(bk_write_less32bytes): - movl %edi, %eax - sub %ecx, %edx - sub %ecx, %eax - POP (%edi) -L(bk_write_less32bytes_2): - BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) - - CFI_PUSH (%edi) - - .p2align 4 -L(bk_align): - cmp $8, %ecx - jbe L(bk_write_less32bytes) - testl $1, %edx - /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, - then (EDX & 2) must be != 0. */ - jz L(bk_got2) - sub $1, %edi - sub $1, %ecx - sub $1, %edx - movzbl (%edi), %eax - movb %al, (%edx) - - testl $2, %edx - jz L(bk_aligned_4) - -L(bk_got2): - sub $2, %edi - sub $2, %ecx - sub $2, %edx - movzwl (%edi), %eax - movw %ax, (%edx) - jmp L(bk_aligned_4) - - .p2align 4 -L(bk_write_more64bytes): - /* Check alignment of last byte. */ - testl $15, %edx - jz L(bk_ssse3_cpy_pre) - -/* EDX is aligned 4 bytes, but not 16 bytes. 
*/ -L(bk_ssse3_align): - sub $4, %edi - sub $4, %ecx - sub $4, %edx - movl (%edi), %eax - movl %eax, (%edx) - - testl $15, %edx - jz L(bk_ssse3_cpy_pre) - - sub $4, %edi - sub $4, %ecx - sub $4, %edx - movl (%edi), %eax - movl %eax, (%edx) - - testl $15, %edx - jz L(bk_ssse3_cpy_pre) - - sub $4, %edi - sub $4, %ecx - sub $4, %edx - movl (%edi), %eax - movl %eax, (%edx) - -L(bk_ssse3_cpy_pre): - cmp $64, %ecx - jb L(bk_write_more32bytes) - - .p2align 4 -L(bk_ssse3_cpy): - sub $64, %edi - sub $64, %ecx - sub $64, %edx - movdqu 0x30(%edi), %xmm3 - movdqa %xmm3, 0x30(%edx) - movdqu 0x20(%edi), %xmm2 - movdqa %xmm2, 0x20(%edx) - movdqu 0x10(%edi), %xmm1 - movdqa %xmm1, 0x10(%edx) - movdqu (%edi), %xmm0 - movdqa %xmm0, (%edx) - cmp $64, %ecx - jae L(bk_ssse3_cpy) - jmp L(bk_write_64bytesless) - -# endif - -END (MEMCPY) - -#endif diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/memcpy.S deleted file mode 100644 index 9a4d183e01..0000000000 --- a/sysdeps/i386/i686/multiarch/memcpy.S +++ /dev/null @@ -1,78 +0,0 @@ -/* Multiple versions of memcpy - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . 
*/ - -#include -#include - -/* Define multiple versions only for the definition in lib and for - DSO. In static binaries we need memcpy before the initialization - happened. */ -#if defined SHARED && IS_IN (libc) - .text -ENTRY(memcpy) - .type memcpy, @gnu_indirect_function - LOAD_GOT_AND_RTLD_GLOBAL_RO - LOAD_FUNC_GOT_EAX (__memcpy_ia32) - HAS_CPU_FEATURE (SSE2) - jz 2f - LOAD_FUNC_GOT_EAX (__memcpy_sse2_unaligned) - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - jnz 2f - HAS_CPU_FEATURE (SSSE3) - jz 2f - LOAD_FUNC_GOT_EAX (__memcpy_ssse3) - HAS_CPU_FEATURE (Fast_Rep_String) - jz 2f - LOAD_FUNC_GOT_EAX (__memcpy_ssse3_rep) -2: ret -END(memcpy) - -# undef ENTRY -# define ENTRY(name) \ - .type __memcpy_ia32, @function; \ - .p2align 4; \ - .globl __memcpy_ia32; \ - .hidden __memcpy_ia32; \ - __memcpy_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32 - -# undef ENTRY_CHK -# define ENTRY_CHK(name) \ - .type __memcpy_chk_ia32, @function; \ - .globl __memcpy_chk_ia32; \ - .p2align 4; \ - __memcpy_chk_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END_CHK -# define END_CHK(name) \ - cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32 - -# undef libc_hidden_builtin_def -/* IFUNC doesn't work with the hidden functions in shared library since - they will be called without setting up EBX needed for PLT which is - used by IFUNC. 
*/ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memcpy; __GI_memcpy = __memcpy_ia32 -#endif - -#include "../memcpy.S" diff --git a/sysdeps/i386/i686/multiarch/memcpy.c b/sysdeps/i386/i686/multiarch/memcpy.c new file mode 100644 index 0000000000..a23ae5c20c --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memcpy.c @@ -0,0 +1 @@ +#include diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/memcpy_chk.S deleted file mode 100644 index 3bbd921555..0000000000 --- a/sysdeps/i386/i686/multiarch/memcpy_chk.S +++ /dev/null @@ -1,50 +0,0 @@ -/* Multiple versions of __memcpy_chk - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include - -/* Define multiple versions only for the definition in lib and for - DSO. There are no multiarch memcpy functions for static binaries. 
- */ -#if IS_IN (libc) -# ifdef SHARED - .text -ENTRY(__memcpy_chk) - .type __memcpy_chk, @gnu_indirect_function - LOAD_GOT_AND_RTLD_GLOBAL_RO - LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32) - HAS_CPU_FEATURE (SSE2) - jz 2f - LOAD_FUNC_GOT_EAX (__memcpy_chk_sse2_unaligned) - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - jnz 2f - HAS_CPU_FEATURE (SSSE3) - jz 2f - LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3) - HAS_CPU_FEATURE (Fast_Rep_String) - jz 2f - LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3_rep) -2: ret -END(__memcpy_chk) -# else -# include "../memcpy_chk.S" -# endif -#endif diff --git a/sysdeps/i386/i686/multiarch/memmove-i386.S b/sysdeps/i386/i686/multiarch/memmove-i386.S new file mode 100644 index 0000000000..9d841c9fd1 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memmove-i386.S @@ -0,0 +1 @@ +/* Dummy file. */ diff --git a/sysdeps/i386/i686/multiarch/memmove-i686.S b/sysdeps/i386/i686/multiarch/memmove-i686.S new file mode 100644 index 0000000000..35ad32749a --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memmove-i686.S @@ -0,0 +1,7 @@ +#include + +#ifdef SHARED + .globl __GI_memmove + .hidden __GI_memmove + __GI_memmove = __memmove_i686 +#endif diff --git a/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S b/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S deleted file mode 100644 index 3873594cb2..0000000000 --- a/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_sse2_unaligned -#define MEMCPY_CHK __memmove_chk_sse2_unaligned -#include "memcpy-sse2-unaligned.S" diff --git a/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S deleted file mode 100644 index d202fc4a13..0000000000 --- a/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_ssse3_rep -#define MEMCPY_CHK __memmove_chk_ssse3_rep -#include "memcpy-ssse3-rep.S" diff --git 
a/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/sysdeps/i386/i686/multiarch/memmove-ssse3.S deleted file mode 100644 index 295430b1ef..0000000000 --- a/sysdeps/i386/i686/multiarch/memmove-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_ssse3 -#define MEMCPY_CHK __memmove_chk_ssse3 -#include "memcpy-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/memmove.S deleted file mode 100644 index 2bf427fe93..0000000000 --- a/sysdeps/i386/i686/multiarch/memmove.S +++ /dev/null @@ -1,89 +0,0 @@ -/* Multiple versions of memmove - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include - -/* Define multiple versions only for the definition in lib. 
*/ -#if IS_IN (libc) - .text -ENTRY(memmove) - .type memmove, @gnu_indirect_function - LOAD_GOT_AND_RTLD_GLOBAL_RO - LOAD_FUNC_GOT_EAX (__memmove_ia32) - HAS_CPU_FEATURE (SSE2) - jz 2f - LOAD_FUNC_GOT_EAX (__memmove_sse2_unaligned) - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - jnz 2f - HAS_CPU_FEATURE (SSSE3) - jz 2f - LOAD_FUNC_GOT_EAX (__memmove_ssse3) - HAS_ARCH_FEATURE (Fast_Rep_String) - jz 2f - LOAD_FUNC_GOT_EAX (__memmove_ssse3_rep) -2: ret -END(memmove) - -# ifdef SHARED -# undef ENTRY -# define ENTRY(name) \ - .type __memmove_ia32, @function; \ - .p2align 4; \ - .globl __memmove_ia32; \ - .hidden __memmove_ia32; \ - __memmove_ia32: cfi_startproc; \ - CALL_MCOUNT -# else -# undef ENTRY -# define ENTRY(name) \ - .type __memmove_ia32, @function; \ - .globl __memmove_ia32; \ - .p2align 4; \ - __memmove_ia32: cfi_startproc; \ - CALL_MCOUNT -# endif - -# undef END -# define END(name) \ - cfi_endproc; .size __memmove_ia32, .-__memmove_ia32 - -# undef ENTRY_CHK -# define ENTRY_CHK(name) \ - .type __memmove_chk_ia32, @function; \ - .globl __memmove_chk_ia32; \ - .p2align 4; \ - __memmove_chk_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END_CHK -# define END_CHK(name) \ - cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32 - -# ifdef SHARED -# undef libc_hidden_builtin_def -/* IFUNC doesn't work with the hidden functions in shared library since - they will be called without setting up EBX needed for PLT which is - used by IFUNC. 
*/ -# define libc_hidden_builtin_def(name) \ - .globl __GI_memmove; __GI_memmove = __memmove_ia32 -# endif -#endif - -#include "../memmove.S" diff --git a/sysdeps/i386/i686/multiarch/memmove.c b/sysdeps/i386/i686/multiarch/memmove.c new file mode 100644 index 0000000000..5953add6e4 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/memmove.c @@ -0,0 +1 @@ +#include diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/memmove_chk.S deleted file mode 100644 index b17f6edbdc..0000000000 --- a/sysdeps/i386/i686/multiarch/memmove_chk.S +++ /dev/null @@ -1,94 +0,0 @@ -/* Multiple versions of __memmove_chk - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include - -/* Define multiple versions only for the definition in lib. 
*/ -#if IS_IN (libc) - .text -ENTRY(__memmove_chk) - .type __memmove_chk, @gnu_indirect_function - LOAD_GOT_AND_RTLD_GLOBAL_RO - LOAD_FUNC_GOT_EAX (__memmove_chk_ia32) - HAS_CPU_FEATURE (SSE2) - jz 2f - LOAD_FUNC_GOT_EAX (__memmove_chk_sse2_unaligned) - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - jnz 2f - HAS_CPU_FEATURE (SSSE3) - jz 2f - LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3) - HAS_CPU_FEATURE (Fast_Rep_String) - jz 2f - LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3_rep) -2: ret -END(__memmove_chk) - -# ifndef SHARED - .type __memmove_chk_sse2_unaligned, @function - .p2align 4; -__memmove_chk_sse2_unaligned: - cfi_startproc - CALL_MCOUNT - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb __chk_fail - jmp __memmove_sse2_unaligned - cfi_endproc - .size __memmove_chk_sse2_unaligned, .-__memmove_chk_sse2_unaligned - - .type __memmove_chk_ssse3, @function - .p2align 4; -__memmove_chk_ssse3: - cfi_startproc - CALL_MCOUNT - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb __chk_fail - jmp __memmove_ssse3 - cfi_endproc - .size __memmove_chk_ssse3, .-__memmove_chk_ssse3 - - .type __memmove_chk_ssse3_rep, @function - .p2align 4; -__memmove_chk_ssse3_rep: - cfi_startproc - CALL_MCOUNT - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb __chk_fail - jmp __memmove_ssse3_rep - cfi_endproc - .size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep - - .type __memmove_chk_ia32, @function - .p2align 4; -__memmove_chk_ia32: - cfi_startproc - CALL_MCOUNT - movl 12(%esp), %eax - cmpl %eax, 16(%esp) - jb __chk_fail - jmp __memmove_ia32 - cfi_endproc - .size __memmove_chk_ia32, .-__memmove_chk_ia32 -# endif -#endif diff --git a/sysdeps/i386/i686/multiarch/mempcpy-i386.S b/sysdeps/i386/i686/multiarch/mempcpy-i386.S new file mode 100644 index 0000000000..9d841c9fd1 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/mempcpy-i386.S @@ -0,0 +1 @@ +/* Dummy file. 
*/ diff --git a/sysdeps/i386/i686/multiarch/mempcpy-i586.S b/sysdeps/i386/i686/multiarch/mempcpy-i586.S new file mode 100644 index 0000000000..9d841c9fd1 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/mempcpy-i586.S @@ -0,0 +1 @@ +/* Dummy file. */ diff --git a/sysdeps/i386/i686/multiarch/mempcpy-i686.S b/sysdeps/i386/i686/multiarch/mempcpy-i686.S new file mode 100644 index 0000000000..f87cac34af --- /dev/null +++ b/sysdeps/i386/i686/multiarch/mempcpy-i686.S @@ -0,0 +1,10 @@ +#include + +#ifdef SHARED + .globl __GI_mempcpy + .hidden __GI_mempcpy + __GI_mempcpy = __mempcpy_i686 + .globl __GI___mempcpy + .hidden __GI___mempcpy + __GI___mempcpy = __mempcpy_i686 +#endif diff --git a/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S b/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S deleted file mode 100644 index a1cea50771..0000000000 --- a/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMPCPY -#define MEMCPY __mempcpy_sse2_unaligned -#define MEMCPY_CHK __mempcpy_chk_sse2_unaligned -#include "memcpy-sse2-unaligned.S" diff --git a/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S deleted file mode 100644 index 5357b33e18..0000000000 --- a/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMPCPY -#define MEMCPY __mempcpy_ssse3_rep -#define MEMCPY_CHK __mempcpy_chk_ssse3_rep -#include "memcpy-ssse3-rep.S" diff --git a/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S deleted file mode 100644 index 822d98e954..0000000000 --- a/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMPCPY -#define MEMCPY __mempcpy_ssse3 -#define MEMCPY_CHK __mempcpy_chk_ssse3 -#include "memcpy-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/mempcpy.S deleted file mode 100644 index 021558a5b0..0000000000 --- 
a/sysdeps/i386/i686/multiarch/mempcpy.S +++ /dev/null @@ -1,81 +0,0 @@ -/* Multiple versions of mempcpy - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include - -/* Define multiple versions only for the definition in lib and for - DSO. In static binaries we need mempcpy before the initialization - happened. 
*/ -#if defined SHARED && IS_IN (libc) - .text -ENTRY(__mempcpy) - .type __mempcpy, @gnu_indirect_function - LOAD_GOT_AND_RTLD_GLOBAL_RO - LOAD_FUNC_GOT_EAX (__mempcpy_ia32) - HAS_CPU_FEATURE (SSE2) - jz 2f - LOAD_FUNC_GOT_EAX (__mempcpy_sse2_unaligned) - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - jnz 2f - HAS_CPU_FEATURE (SSSE3) - jz 2f - LOAD_FUNC_GOT_EAX (__mempcpy_ssse3) - HAS_CPU_FEATURE (Fast_Rep_String) - jz 2f - LOAD_FUNC_GOT_EAX (__mempcpy_ssse3_rep) -2: ret -END(__mempcpy) - -# undef ENTRY -# define ENTRY(name) \ - .type __mempcpy_ia32, @function; \ - .p2align 4; \ - .globl __mempcpy_ia32; \ - .hidden __mempcpy_ia32; \ - __mempcpy_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END -# define END(name) \ - cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32 - -# undef ENTRY_CHK -# define ENTRY_CHK(name) \ - .type __mempcpy_chk_ia32, @function; \ - .globl __mempcpy_chk_ia32; \ - .p2align 4; \ - __mempcpy_chk_ia32: cfi_startproc; \ - CALL_MCOUNT -# undef END_CHK -# define END_CHK(name) \ - cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32 - -# undef libc_hidden_def -# undef libc_hidden_builtin_def -/* IFUNC doesn't work with the hidden functions in shared library since - they will be called without setting up EBX needed for PLT which is - used by IFUNC. 
*/ -# define libc_hidden_def(name) \ - .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32 -# define libc_hidden_builtin_def(name) \ - .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32 -#endif - -#include "../mempcpy.S" diff --git a/sysdeps/i386/i686/multiarch/mempcpy.c b/sysdeps/i386/i686/multiarch/mempcpy.c new file mode 100644 index 0000000000..1ae8773514 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/mempcpy.c @@ -0,0 +1 @@ +#include diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/mempcpy_chk.S deleted file mode 100644 index 1bea6eab38..0000000000 --- a/sysdeps/i386/i686/multiarch/mempcpy_chk.S +++ /dev/null @@ -1,50 +0,0 @@ -/* Multiple versions of __mempcpy_chk - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2010-2015 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - . */ - -#include -#include - -/* Define multiple versions only for the definition in lib and for - DSO. There are no multiarch mempcpy functions for static binaries. 
- */ -#if IS_IN (libc) -# ifdef SHARED - .text -ENTRY(__mempcpy_chk) - .type __mempcpy_chk, @gnu_indirect_function - LOAD_GOT_AND_RTLD_GLOBAL_RO - LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32) - HAS_CPU_FEATURE (SSE2) - jz 2f - LOAD_FUNC_GOT_EAX (__mempcpy_chk_sse2_unaligned) - HAS_ARCH_FEATURE (Fast_Unaligned_Load) - jnz 2f - HAS_CPU_FEATURE (SSSE3) - jz 2f - LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3) - HAS_CPU_FEATURE (Fast_Rep_String) - jz 2f - LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3_rep) -2: ret -END(__mempcpy_chk) -# else -# include "../mempcpy_chk.S" -# endif -#endif diff --git a/sysdeps/i386/i686/multiarch/rtld-memcpy.S b/sysdeps/i386/i686/multiarch/rtld-memcpy.S new file mode 100644 index 0000000000..0ab09fb554 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/rtld-memcpy.S @@ -0,0 +1,19 @@ +/* memcpy for ld.so + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include diff --git a/sysdeps/i386/i686/multiarch/rtld-memmove.S b/sysdeps/i386/i686/multiarch/rtld-memmove.S new file mode 100644 index 0000000000..38f589af04 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/rtld-memmove.S @@ -0,0 +1,19 @@ +/* memmove for ld.so + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include diff --git a/sysdeps/i386/i686/multiarch/static-memcpy.S b/sysdeps/i386/i686/multiarch/static-memcpy.S new file mode 100644 index 0000000000..ef9f0b931f --- /dev/null +++ b/sysdeps/i386/i686/multiarch/static-memcpy.S @@ -0,0 +1,21 @@ +/* memcpy for libc.a + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ +#if !defined SHARED && IS_IN (libc) +# include +#endif diff --git a/sysdeps/i386/i686/multiarch/static-memmove.S b/sysdeps/i386/i686/multiarch/static-memmove.S new file mode 100644 index 0000000000..6fc3a39664 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/static-memmove.S @@ -0,0 +1,21 @@ +/* memmove for libc.a + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if !defined SHARED && IS_IN (libc) +# include +#endif diff --git a/sysdeps/i386/i686/multiarch/static-mempcpy.S b/sysdeps/i386/i686/multiarch/static-mempcpy.S new file mode 100644 index 0000000000..60db75c61b --- /dev/null +++ b/sysdeps/i386/i686/multiarch/static-mempcpy.S @@ -0,0 +1,21 @@ +/* mempcpy for libc.a + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details.
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if !defined SHARED && IS_IN (libc) +# include +#endif diff --git a/sysdeps/i386/multiarch/Makefile b/sysdeps/i386/multiarch/Makefile index 44fabedd1d..58dffcd78e 100644 --- a/sysdeps/i386/multiarch/Makefile +++ b/sysdeps/i386/multiarch/Makefile @@ -5,7 +5,16 @@ endif ifeq ($(subdir),string) gen-as-const-headers += locale-defines.sym -sysdep_routines += bzero-i386 bzero-i586 bzero-i686 \ +sysdep_routines += bcopy-i386 bcopy-i686 bcopy-sse2-unaligned \ + bcopy-ssse3 bcopy-ssse3-rep \ + memcpy-i386 memcpy-i586 memcpy-i686 \ + memcpy-sse2-unaligned memcpy-ssse3 memcpy-ssse3-rep \ + memmove-i386 memmove-i686 memmove-sse2-unaligned \ + memmove-ssse3 memmove-ssse3-rep \ + mempcpy-i386 mempcpy-i586 mempcpy-i686 \ + mempcpy-sse2-unaligned mempcpy-ssse3 mempcpy-ssse3-rep \ + static-memcpy static-memmove static-mempcpy \ + bzero-i386 bzero-i586 bzero-i686 \ bzero-sse2 bzero-sse2-rep \ memset-i386 memset-i586 memset-i686 \ memset-sse2 memset-sse2-rep diff --git a/sysdeps/i386/multiarch/bcopy-i386.S b/sysdeps/i386/multiarch/bcopy-i386.S new file mode 100644 index 0000000000..dbc9bd176f --- /dev/null +++ b/sysdeps/i386/multiarch/bcopy-i386.S @@ -0,0 +1,12 @@ +#define bcopy __bcopy_i386 +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) +#undef weak_alias +#define weak_alias(name, aliasname) +#include + +#ifdef SHARED + .globl __GI_bcopy + .hidden __GI_bcopy + __GI_bcopy = __bcopy_i386 +#endif diff --git a/sysdeps/i386/multiarch/bcopy-i686.S b/sysdeps/i386/multiarch/bcopy-i686.S new file mode 100644 index 0000000000..335ac858e1 --- /dev/null +++ b/sysdeps/i386/multiarch/bcopy-i686.S @@ -0,0 +1,6 @@ +#define bcopy __bcopy_i686 +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) +#undef weak_alias +#define weak_alias(name, aliasname) +#include diff --git a/sysdeps/i386/multiarch/bcopy-sse2-unaligned.S 
b/sysdeps/i386/multiarch/bcopy-sse2-unaligned.S new file mode 100644 index 0000000000..efef2a10dd --- /dev/null +++ b/sysdeps/i386/multiarch/bcopy-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_sse2_unaligned +#include "memcpy-sse2-unaligned.S" diff --git a/sysdeps/i386/multiarch/bcopy-ssse3-rep.S b/sysdeps/i386/multiarch/bcopy-ssse3-rep.S new file mode 100644 index 0000000000..cbc8b420e8 --- /dev/null +++ b/sysdeps/i386/multiarch/bcopy-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/sysdeps/i386/multiarch/bcopy-ssse3.S b/sysdeps/i386/multiarch/bcopy-ssse3.S new file mode 100644 index 0000000000..36aac44b9c --- /dev/null +++ b/sysdeps/i386/multiarch/bcopy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_ssse3 +#include "memcpy-ssse3.S" diff --git a/sysdeps/i386/multiarch/bcopy.c b/sysdeps/i386/multiarch/bcopy.c new file mode 100644 index 0000000000..eefbd4eb86 --- /dev/null +++ b/sysdeps/i386/multiarch/bcopy.c @@ -0,0 +1,64 @@ +/* Multiple versions of bcopy. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) +/* Redefine bcopy so that the compiler won't complain about the type + mismatch with the IFUNC selector, bcopy_ifunc, below. */ +# undef bcopy +# define bcopy __redirect_bcopy +# include +# undef bcopy + +# include + +extern __typeof (__redirect_bcopy) __bcopy_i386 attribute_hidden; +extern __typeof (__redirect_bcopy) __bcopy_i686 attribute_hidden; +extern __typeof (__redirect_bcopy) __bcopy_sse2_unaligned attribute_hidden; +extern __typeof (__redirect_bcopy) __bcopy_ssse3 attribute_hidden; +extern __typeof (__redirect_bcopy) __bcopy_ssse3_rep attribute_hidden; + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ +extern __typeof (__redirect_bcopy) bcopy; +extern void *bcopy_ifunc (void) __asm__ ("bcopy"); + +void * /* IFUNC selector: returns the bcopy implementation to use. */ +bcopy_ifunc (void) +{ + if (HAS_CPU_FEATURE (SSE2)) /* All SSE variants are gated on SSE2. */ + { + if (HAS_ARCH_FEATURE (Fast_Unaligned_Load)) /* Checked before SSSE3. */ + return __bcopy_sse2_unaligned; + else if (HAS_CPU_FEATURE (SSSE3)) + { + if (HAS_ARCH_FEATURE (Fast_Rep_String)) + return __bcopy_ssse3_rep; + else + return __bcopy_ssse3; + } + } + + if (USE_I686) /* No SSE variant was selected above. */ + return __bcopy_i686; + else + return __bcopy_i386; +} +__asm__ (".type bcopy, %gnu_indirect_function"); +#endif diff --git a/sysdeps/i386/multiarch/ifunc-impl-list.c b/sysdeps/i386/multiarch/ifunc-impl-list.c index 7bde24e112..d5df5de426 100644 --- a/sysdeps/i386/multiarch/ifunc-impl-list.c +++ b/sysdeps/i386/multiarch/ifunc-impl-list.c @@ -36,7 +36,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, size_t i = 0; -#if 0 /* Support sysdeps/i386/i686/multiarch/bcopy.S.
*/ IFUNC_IMPL (i, name, bcopy, IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3), @@ -45,8 +44,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __bcopy_ssse3) IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2), __bcopy_sse2_unaligned) - IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32)) + IFUNC_IMPL_ADD (array, i, bcopy, HAS_I686, __bcopy_i686) +#if MINIMUM_ISA < 686 + IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_i386) #endif + ) /* Support sysdeps/i386/i686/multiarch/bzero.S. */ IFUNC_IMPL (i, name, bzero, @@ -77,6 +79,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3), __memcmp_ssse3) IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_ia32)) +#endif /* Support sysdeps/i386/i686/multiarch/memmove_chk.S. */ IFUNC_IMPL (i, name, __memmove_chk, @@ -89,8 +92,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_CPU_FEATURE (SSE2), __memmove_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_I686, + __memmove_chk_i686) +#if MINIMUM_ISA < 686 IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, - __memmove_chk_ia32)) + __memmove_chk_i386) +#endif + ) /* Support sysdeps/i386/i686/multiarch/memmove.S. */ IFUNC_IMPL (i, name, memmove, @@ -100,8 +108,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memmove_ssse3) IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2), __memmove_sse2_unaligned) - IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32)) + IFUNC_IMPL_ADD (array, i, memmove, HAS_I686, __memmove_i686) +#if MINIMUM_ISA < 686 + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_i386) +#endif + ) +#if 0 /* Support sysdeps/i386/i686/multiarch/memrchr.S. 
*/ IFUNC_IMPL (i, name, memrchr, IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2), @@ -329,6 +342,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3), __wmemcmp_ssse3) IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_ia32)) +#endif #ifdef SHARED /* Support sysdeps/i386/i686/multiarch/memcpy_chk.S. */ @@ -342,8 +356,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_CPU_FEATURE (SSE2), __memcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_I686, + __memcpy_chk_i686) +# if MINIMUM_ISA < 686 + IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_I586, + __memcpy_chk_i586) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, - __memcpy_chk_ia32)) + __memcpy_chk_i386) +# endif + ) /* Support sysdeps/i386/i686/multiarch/memcpy.S. */ IFUNC_IMPL (i, name, memcpy, @@ -353,7 +374,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memcpy_ssse3) IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2), __memcpy_sse2_unaligned) - IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32)) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_I686, __memcpy_i686) +#if MINIMUM_ISA < 686 + IFUNC_IMPL_ADD (array, i, memcpy, HAS_I586, __memcpy_i586) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_i386) +# endif + ) /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S. */ IFUNC_IMPL (i, name, __mempcpy_chk, @@ -366,8 +392,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_CPU_FEATURE (SSE2), __mempcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_I686, + __mempcpy_chk_i686) +# if MINIMUM_ISA < 686 + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_I586, + __mempcpy_chk_i586) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, - __mempcpy_chk_ia32)) + __mempcpy_chk_i386) +# endif + ) /* Support sysdeps/i386/i686/multiarch/mempcpy.S. 
*/ IFUNC_IMPL (i, name, mempcpy, @@ -377,8 +410,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __mempcpy_ssse3) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2), __mempcpy_sse2_unaligned) - IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32)) + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_I686, __mempcpy_i686) +# if MINIMUM_ISA < 686 + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_I586, __mempcpy_i586) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_i386) +# endif + ) +#if 0 /* Support sysdeps/i386/i686/multiarch/strlen.S. */ IFUNC_IMPL (i, name, strlen, IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2), diff --git a/sysdeps/i386/multiarch/memcpy-i386.S b/sysdeps/i386/multiarch/memcpy-i386.S new file mode 100644 index 0000000000..d26b195369 --- /dev/null +++ b/sysdeps/i386/multiarch/memcpy-i386.S @@ -0,0 +1,11 @@ +#ifdef SHARED +# define memcpy __memcpy_i386 +# define __memcpy_chk __memcpy_chk_i386 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) +# include + + .globl __GI_memcpy + .hidden __GI_memcpy + __GI_memcpy = __memcpy_i386 +#endif diff --git a/sysdeps/i386/multiarch/memcpy-i586.S b/sysdeps/i386/multiarch/memcpy-i586.S new file mode 100644 index 0000000000..a9d89d97d1 --- /dev/null +++ b/sysdeps/i386/multiarch/memcpy-i586.S @@ -0,0 +1,7 @@ +#ifdef SHARED +# define memcpy __memcpy_i586 +# define __memcpy_chk __memcpy_chk_i586 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) +# include +#endif diff --git a/sysdeps/i386/multiarch/memcpy-i686.S b/sysdeps/i386/multiarch/memcpy-i686.S new file mode 100644 index 0000000000..c5f516e931 --- /dev/null +++ b/sysdeps/i386/multiarch/memcpy-i686.S @@ -0,0 +1,7 @@ +#ifdef SHARED +# define memcpy __memcpy_i686 +# define __memcpy_chk __memcpy_chk_i686 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) +# include +#endif diff --git a/sysdeps/i386/multiarch/memcpy-sse2-unaligned.S 
b/sysdeps/i386/multiarch/memcpy-sse2-unaligned.S new file mode 100644 index 0000000000..8215c70b15 --- /dev/null +++ b/sysdeps/i386/multiarch/memcpy-sse2-unaligned.S @@ -0,0 +1,681 @@ +/* memcpy optimized with SSE2 unaligned memory access instructions. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#if IS_IN (libc) \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +# include +# include "asm-syntax.h" + +# ifndef MEMCPY +# define MEMCPY __memcpy_sse2_unaligned +# define MEMCPY_CHK __memcpy_chk_sse2_unaligned +# endif + +# ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +# else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 /* Preserve EBX. 
*/ +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) + + .section .text.sse2,"ax",@progbits +# if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) /* Jump to __chk_fail if 16(%esp) < 12(%esp), unsigned. */ + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +# endif + +ENTRY (MEMCPY) /* Loads ECX = LEN, EAX = SRC, EDX = DEST; ENTRANCE saves EBX. */ + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + cmp %edx, %eax + +# ifdef USE_AS_MEMMOVE + ja L(check_forward) /* Unsigned: SRC is above DEST. */ + +L(mm_len_0_or_more_backward): +/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128] + separately. Lengths are compared as unsigned values. */ + cmp $16, %ecx + jbe L(mm_len_0_16_bytes_backward) + + cmpl $32, %ecx + ja L(mm_len_32_or_more_backward) + +/* Copy [0..32] and return. */ + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_32_or_more_backward): + cmpl $64, %ecx + ja L(mm_len_64_or_more_backward) + +/* Copy [0..64] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu -16(%eax, %ecx), %xmm2 + movdqu -32(%eax, %ecx), %xmm3 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, -16(%edx, %ecx) + movdqu %xmm3, -32(%edx, %ecx) + jmp L(return) + +L(mm_len_64_or_more_backward): + cmpl $128, %ecx + ja L(mm_len_128_or_more_backward) + +/* Copy [0..128] and return.
*/ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_128_or_more_backward): + add %ecx, %eax + cmp %edx, %eax + movl SRC(%esp), %eax + jbe L(forward) /* Unsigned: SRC + LEN <= DEST, no overlap. */ + PUSH (%esi) + PUSH (%edi) + PUSH (%ebx) + +/* Aligning the address of destination. */ + movdqu (%eax), %xmm4 + movdqu 16(%eax), %xmm5 + movdqu 32(%eax), %xmm6 + movdqu 48(%eax), %xmm7 + leal (%edx, %ecx), %esi + movdqu -16(%eax, %ecx), %xmm0 + subl $16, %esp + movdqu %xmm0, (%esp) + mov %ecx, %edi + movl %esi, %ecx + andl $-16, %ecx + leal (%ecx), %ebx + subl %edx, %ebx + leal (%eax, %ebx), %eax + shrl $6, %ebx + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %edi +# else +# ifdef SHARED + PUSH (%ebx) + SETUP_PIC_REG (bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi + POP (%ebx) +# else + cmp __x86_shared_cache_size_half, %edi +# endif +# endif + jae L(mm_large_page_loop_backward) + + .p2align 4 +L(mm_main_loop_backward): + + prefetcht0 -128(%eax) + + movdqu -64(%eax), %xmm0 + movdqu -48(%eax), %xmm1 + movdqu -32(%eax), %xmm2 + movdqu -16(%eax), %xmm3 + movaps %xmm0, -64(%ecx) + subl $64, %eax + movaps %xmm1, -48(%ecx) + movaps %xmm2, -32(%ecx) + movaps %xmm3, -16(%ecx) + subl $64, %ecx + sub $1, %ebx + jnz L(mm_main_loop_backward) + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, -16(%esi) + movdqu %xmm4, (%edx) + movdqu %xmm5, 16(%edx) + movdqu %xmm6, 32(%edx) + movdqu %xmm7, 48(%edx) + POP (%ebx) + jmp L(mm_return_pop_all) + +/* Copy [0..16] and return.
*/ +L(mm_len_0_16_bytes_backward): + testb $24, %cl + jnz L(mm_len_9_16_bytes_backward) + testb $4, %cl + .p2align 4,,5 + jnz L(mm_len_5_8_bytes_backward) + testl %ecx, %ecx + .p2align 4,,2 + je L(return) + testb $2, %cl + .p2align 4,,1 + jne L(mm_len_3_4_bytes_backward) + movzbl -1(%eax,%ecx), %ebx + movzbl (%eax), %eax + movb %bl, -1(%edx,%ecx) + movb %al, (%edx) + jmp L(return) + +L(mm_len_3_4_bytes_backward): + movzwl -2(%eax,%ecx), %ebx + movzwl (%eax), %eax + movw %bx, -2(%edx,%ecx) + movw %ax, (%edx) + jmp L(return) + +L(mm_len_9_16_bytes_backward): + PUSH (%esi) + movl -4(%eax,%ecx), %ebx + movl -8(%eax,%ecx), %esi + movl %ebx, -4(%edx,%ecx) + movl %esi, -8(%edx,%ecx) + subl $8, %ecx + POP (%esi) + jmp L(mm_len_0_16_bytes_backward) + +L(mm_len_5_8_bytes_backward): + movl (%eax), %ebx + movl -4(%eax,%ecx), %eax + movl %ebx, (%edx) + movl %eax, -4(%edx,%ecx) + jmp L(return) + +/* Big length copy backward part. */ + .p2align 4 +L(mm_large_page_loop_backward): + movdqu -64(%eax), %xmm0 + movdqu -48(%eax), %xmm1 + movdqu -32(%eax), %xmm2 + movdqu -16(%eax), %xmm3 + movntdq %xmm0, -64(%ecx) + subl $64, %eax + movntdq %xmm1, -48(%ecx) + movntdq %xmm2, -32(%ecx) + movntdq %xmm3, -16(%ecx) + subl $64, %ecx + sub $1, %ebx + jnz L(mm_large_page_loop_backward) + sfence + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, -16(%esi) + movdqu %xmm4, (%edx) + movdqu %xmm5, 16(%edx) + movdqu %xmm6, 32(%edx) + movdqu %xmm7, 48(%edx) + POP (%ebx) + jmp L(mm_return_pop_all) + +L(check_forward): + add %edx, %ecx + cmp %eax, %ecx + movl LEN(%esp), %ecx + jbe L(forward) /* Unsigned: DEST + LEN <= SRC, no overlap. */ + +/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128] + separately. */ + cmp $16, %ecx + jbe L(mm_len_0_16_bytes_forward) + + cmpl $32, %ecx + ja L(mm_len_32_or_more_forward) + +/* Copy [0..32] and return.
*/ + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_32_or_more_forward): + cmpl $64, %ecx + ja L(mm_len_64_or_more_forward) + +/* Copy [0..64] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu -16(%eax, %ecx), %xmm2 + movdqu -32(%eax, %ecx), %xmm3 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, -16(%edx, %ecx) + movdqu %xmm3, -32(%edx, %ecx) + jmp L(return) + +L(mm_len_64_or_more_forward): + cmpl $128, %ecx + ja L(mm_len_128_or_more_forward) + +/* Copy [0..128] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_128_or_more_forward): + PUSH (%esi) + PUSH (%edi) + PUSH (%ebx) + +/* Aligning the address of destination.
*/ + movdqu -16(%eax, %ecx), %xmm4 + movdqu -32(%eax, %ecx), %xmm5 + movdqu -48(%eax, %ecx), %xmm6 + movdqu -64(%eax, %ecx), %xmm7 + leal (%edx, %ecx), %esi + movdqu (%eax), %xmm0 + subl $16, %esp + movdqu %xmm0, (%esp) + mov %ecx, %edi + leal 16(%edx), %ecx + andl $-16, %ecx + movl %ecx, %ebx + subl %edx, %ebx + addl %ebx, %eax + movl %esi, %ebx + subl %ecx, %ebx + shrl $6, %ebx + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %edi +# else +# ifdef SHARED + PUSH (%ebx) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi + POP (%ebx) +# else + cmp __x86_shared_cache_size_half, %edi +# endif +# endif + jae L(mm_large_page_loop_forward) + + .p2align 4 +L(mm_main_loop_forward): + + prefetcht0 128(%eax) + + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqa %xmm0, (%ecx) + addl $64, %eax + movaps %xmm1, 16(%ecx) + movaps %xmm2, 32(%ecx) + movaps %xmm3, 48(%ecx) + addl $64, %ecx + sub $1, %ebx + jnz L(mm_main_loop_forward) + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, (%edx) + movdqu %xmm4, -16(%esi) + movdqu %xmm5, -32(%esi) + movdqu %xmm6, -48(%esi) + movdqu %xmm7, -64(%esi) + POP (%ebx) + jmp L(mm_return_pop_all) + +L(mm_len_0_16_bytes_forward): + testb $24, %cl + jne L(mm_len_9_16_bytes_forward) + testb $4, %cl + .p2align 4,,5 + jne L(mm_len_5_8_bytes_forward) + testl %ecx, %ecx + .p2align 4,,2 + je L(return) + testb $2, %cl + .p2align 4,,1 + jne L(mm_len_2_4_bytes_forward) + movzbl -1(%eax,%ecx), %ebx + movzbl (%eax), %eax + movb %bl, -1(%edx,%ecx) + movb %al, (%edx) + jmp L(return) + +L(mm_len_2_4_bytes_forward): + movzwl -2(%eax,%ecx), %ebx + movzwl (%eax), %eax + movw %bx, -2(%edx,%ecx) + movw %ax, (%edx) + jmp L(return) + +L(mm_len_5_8_bytes_forward): + movl (%eax), %ebx + movl -4(%eax,%ecx), %eax + movl %ebx, (%edx) + movl %eax, -4(%edx,%ecx) + jmp L(return) + +L(mm_len_9_16_bytes_forward): + movq (%eax), %xmm0 + movq
-8(%eax, %ecx), %xmm1 + movq %xmm0, (%edx) + movq %xmm1, -8(%edx, %ecx) + jmp L(return) + +L(mm_return_pop_all): + movl %edx, %eax + POP (%edi) + POP (%esi) + RETURN + +/* Big length copy forward part. */ + .p2align 4 +L(mm_large_page_loop_forward): + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movntdq %xmm0, (%ecx) + addl $64, %eax + movntdq %xmm1, 16(%ecx) + movntdq %xmm2, 32(%ecx) + movntdq %xmm3, 48(%ecx) + addl $64, %ecx + sub $1, %ebx + jnz L(mm_large_page_loop_forward) + sfence + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, (%edx) + movdqu %xmm4, -16(%esi) + movdqu %xmm5, -32(%esi) + movdqu %xmm6, -48(%esi) + movdqu %xmm7, -64(%esi) + POP (%ebx) + jmp L(mm_return_pop_all) +# endif + +L(forward): + cmp $16, %ecx + jbe L(len_0_16_bytes) + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +# endif + jae L(large_page) + + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + cmpl $32, %ecx + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jbe L(return) + + movdqu 16(%eax), %xmm0 + movdqu -32(%eax, %ecx), %xmm1 + cmpl $64, %ecx + movdqu %xmm0, 16(%edx) + movdqu %xmm1, -32(%edx, %ecx) + jbe L(return) + + movdqu 32(%eax), %xmm0 + movdqu 48(%eax), %xmm1 + movdqu -48(%eax, %ecx), %xmm2 + movdqu -64(%eax, %ecx), %xmm3 + cmpl $128, %ecx + movdqu %xmm0, 32(%edx) + movdqu %xmm1, 48(%edx) + movdqu %xmm2, -48(%edx, %ecx) + movdqu %xmm3, -64(%edx, %ecx) + jbe L(return) + +/* Now the main loop: we align the address of the destination. */ + leal 64(%edx), %ebx + andl $-64, %ebx + + addl %edx, %ecx + andl $-64, %ecx + + subl %edx, %eax + +/* We should stop two iterations before the termination + (in order not to misprefetch).
*/ + subl $64, %ecx + cmpl %ebx, %ecx + je L(main_loop_just_one_iteration) + + subl $64, %ecx + cmpl %ebx, %ecx + je L(main_loop_last_two_iterations) + + .p2align 4 +L(main_loop_cache): + + prefetcht0 128(%ebx, %eax) + + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqa %xmm0, (%ebx) + movaps %xmm1, 16(%ebx) + movaps %xmm2, 32(%ebx) + movaps %xmm3, 48(%ebx) + lea 64(%ebx), %ebx + cmpl %ebx, %ecx + jne L(main_loop_cache) + +L(main_loop_last_two_iterations): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqu 64(%ebx, %eax), %xmm4 + movdqu 80(%ebx, %eax), %xmm5 + movdqu 96(%ebx, %eax), %xmm6 + movdqu 112(%ebx, %eax), %xmm7 + movdqa %xmm0, (%ebx) + movaps %xmm1, 16(%ebx) + movaps %xmm2, 32(%ebx) + movaps %xmm3, 48(%ebx) + movaps %xmm4, 64(%ebx) + movaps %xmm5, 80(%ebx) + movaps %xmm6, 96(%ebx) + movaps %xmm7, 112(%ebx) + jmp L(return) + +L(main_loop_just_one_iteration): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqa %xmm0, (%ebx) + movaps %xmm1, 16(%ebx) + movaps %xmm2, 32(%ebx) + movaps %xmm3, 48(%ebx) + jmp L(return) + +L(large_page): + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + + movdqu 64(%eax), %xmm0 + movdqu 80(%eax), %xmm1 + movdqu 96(%eax), %xmm2 + movdqu 112(%eax), %xmm3 + movdqu -128(%eax, %ecx), %xmm4 + movdqu -112(%eax, %ecx), %xmm5 + movdqu -96(%eax, %ecx), %xmm6 + movdqu -80(%eax, %ecx), %xmm7 + movdqu %xmm0,
64(%edx) + movdqu %xmm1, 80(%edx) + movdqu %xmm2, 96(%edx) + movdqu %xmm3, 112(%edx) + movdqu %xmm4, -128(%edx, %ecx) + movdqu %xmm5, -112(%edx, %ecx) + movdqu %xmm6, -96(%edx, %ecx) + movdqu %xmm7, -80(%edx, %ecx) + +/* Now the main loop with non temporal stores. We align + the address of the destination. */ + leal 128(%edx), %ebx + andl $-128, %ebx + + addl %edx, %ecx + andl $-128, %ecx + + subl %edx, %eax + + .p2align 4 +L(main_loop_large_page): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqu 64(%ebx, %eax), %xmm4 + movdqu 80(%ebx, %eax), %xmm5 + movdqu 96(%ebx, %eax), %xmm6 + movdqu 112(%ebx, %eax), %xmm7 + movntdq %xmm0, (%ebx) + movntdq %xmm1, 16(%ebx) + movntdq %xmm2, 32(%ebx) + movntdq %xmm3, 48(%ebx) + movntdq %xmm4, 64(%ebx) + movntdq %xmm5, 80(%ebx) + movntdq %xmm6, 96(%ebx) + movntdq %xmm7, 112(%ebx) + lea 128(%ebx), %ebx + cmpl %ebx, %ecx + jne L(main_loop_large_page) + sfence + jmp L(return) + +L(len_0_16_bytes): + testb $24, %cl + jne L(len_9_16_bytes) + testb $4, %cl + .p2align 4,,5 + jne L(len_5_8_bytes) + testl %ecx, %ecx + .p2align 4,,2 + je L(return) + movzbl (%eax), %ebx + testb $2, %cl + movb %bl, (%edx) + je L(return) + movzwl -2(%eax,%ecx), %ebx + movw %bx, -2(%edx,%ecx) + jmp L(return) + +L(len_9_16_bytes): + movq (%eax), %xmm0 + movq -8(%eax, %ecx), %xmm1 + movq %xmm0, (%edx) + movq %xmm1, -8(%edx, %ecx) + jmp L(return) + +L(len_5_8_bytes): + movl (%eax), %ebx + movl %ebx, (%edx) + movl -4(%eax,%ecx), %ebx + movl %ebx, -4(%edx,%ecx) + +L(return): + movl %edx, %eax +# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif + RETURN + +END (MEMCPY) +#endif diff --git a/sysdeps/i386/multiarch/memcpy-ssse3-rep.S b/sysdeps/i386/multiarch/memcpy-ssse3-rep.S new file mode 100644 index 0000000000..08d877c03a --- /dev/null +++ b/sysdeps/i386/multiarch/memcpy-ssse3-rep.S @@ -0,0 +1,1809 @@ +/* memcpy with SSSE3 and REP
string. + Copyright (C) 2010-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#if IS_IN (libc) \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +#include "asm-syntax.h" + +/* Default entry-point names, used unless MEMCPY is already defined. */ +#ifndef MEMCPY +# define MEMCPY __memcpy_ssse3_rep +# define MEMCPY_CHK __memcpy_chk_ssse3_rep +#endif + +/* Offsets of the arguments from ESP once ENTRANCE has run. The bcopy + build takes the source first; every other build takes the + destination first. */ +#ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +#else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +#endif + +/* Keep the unwind information in step with a 4-byte push of REG. */ +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +/* Keep the unwind information in step with a 4-byte pop of REG. */ +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef SHARED +# define PARMS 8 /* Preserve EBX. */ +# define ENTRANCE PUSH (%ebx); +/* RETURN_END pops EBX and returns. RETURN is for returns that are + followed by more code of the routine: after the pop, CFI_PUSH + describes EBX as saved again for the code that follows. */ +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is the register that contains + the index into the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into EBX.
*/ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ + /* EBX is now the address of the target. Go. */ \ + jmp *%ebx + +/* First half of BRANCH_TO_JMPTBL_ENTRY without the PC load: adds the + distance from this instruction to TABLE to EBX. Presumably EBX must + already hold the address of this instruction, as it does when the + code was itself reached through a jump table. */ +# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \ + addl $(TABLE - .), %ebx + +/* Second half: EBX holds the address of TABLE. */ +# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ + addl (%ebx,INDEX,SCALE), %ebx; \ + /* EBX is now the address of the target. Go. */ \ + jmp *%ebx +#else +# define PARMS 4 +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is the register that contains the index into + the jump table. SCALE is the scale of INDEX. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) + +# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) + +# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +#endif + + .section .text.ssse3,"ax",@progbits +#if !defined USE_AS_BCOPY +/* Checked entry point. 12(%esp) is the length and 16(%esp) is the + fourth argument, presumably the size of the destination object. + Fail if that is smaller than the length; otherwise fall through + into MEMCPY, which follows immediately. */ +ENTRY (MEMCPY_CHK) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif +ENTRY (MEMCPY) + ENTRANCE +/* ECX = length, EAX = source, EDX = destination. */ + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + +#ifdef USE_AS_MEMMOVE +/* Copy forward when the destination is below the source and return at + once when they are equal. Otherwise fewer than 48 bytes go to + L(bk_write_less48bytes), and longer copies go backward only when the + destination is below source + length. */ + cmp %eax, %edx + jb L(copy_forward) + je L(fwd_write_0bytes) + cmp $48, %ecx + jb L(bk_write_less48bytes) + add %ecx, %eax + cmp %eax, %edx + movl SRC(%esp), %eax + jb L(copy_backward) + +L(copy_forward): +#endif + cmp $48, %ecx + jae L(48bytesormore) + +/* Despite the label name, execution falls through to here with + ECX < 48. */ +L(fwd_write_less32bytes): +#ifndef USE_AS_MEMMOVE +/* Only the low bytes of the two pointers are compared. */ + cmp %dl, %al + jb L(bk_write) +#endif +/* The forward table entries address the data with negative offsets, + so point EAX and EDX at the end of both buffers first. */ + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) +#ifndef USE_AS_MEMMOVE +L(bk_write): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) +#endif + + ALIGN (4) +/* ECX >= 48. EDX is the destination as passed in; it is rounded up to + a 16-byte boundary below.
*/ +L(48bytesormore): +/* Keep the first 16 source bytes in XMM0; each copy path stores them + through ESI, which holds the original destination. */ + movdqu (%eax), %xmm0 + PUSH (%edi) + movl %edx, %edi + and $-16, %edx + PUSH (%esi) + cfi_remember_state + add $16, %edx + movl %edi, %esi +/* EDX is now the next 16-byte boundary above the destination. EDI is + minus the number of bytes skipped: shrink ECX and advance EAX by it. */ + sub %edx, %edi + add %edi, %ecx + sub %edi, %eax + +/* At or above the shared cache half size, use L(large_page). */ +#ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +#endif + +/* MOV leaves the flags of the compare above intact for the JAE. */ + mov %eax, %edi + jae L(large_page) +/* Dispatch through L(shl_table) on the low four bits of the source + address; zero means the source is 16-byte aligned as well. */ + and $0xf, %edi + jz L(shl_0) + + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) + + ALIGN (4) +/* Source and destination are both 16-byte aligned. Up to 127 bytes + are copied 32 at a time here; anything longer goes to + L(shl_0_gobble). The JB after each step tests the borrow of the + SUB, which the intervening MOVDQA and LEA do not change. */ +L(shl_0): + movdqu %xmm0, (%esi) + xor %edi, %edi + cmp $127, %ecx + ja L(shl_0_gobble) + lea -32(%ecx), %ecx +L(shl_0_loop): + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi +/* Undo the 32-byte bias on ECX, point EAX and EDX at the end of both + buffers and let the forward table copy the remaining tail. */ +L(shl_0_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + add %edi, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state +L(shl_0_gobble): + +/* EDI = data cache half size. It has to be loaded, not compared, in + every configuration: the code after the #endif derives the threshold + EDI - EDI/8 from it and performs the compare that the JAE uses. + (EDI was zeroed at L(shl_0), so a compare here would leave the + threshold at zero and the cached loop unreachable.) */ +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %edi +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi +# else + mov __x86_data_cache_size_half, %edi +# endif +#endif + mov %edi, %esi + shr $3, %esi + sub %esi, %edi + cmp %edi, %ecx + jae
L(shl_0_gobble_mem_start) + sub $128, %ecx + ALIGN (4) +L(shl_0_gobble_cache_loop): + movdqa (%eax), %xmm0 + movaps 0x10(%eax), %xmm1 + movaps 0x20(%eax), %xmm2 + movaps 0x30(%eax), %xmm3 + movaps 0x40(%eax), %xmm4 + movaps 0x50(%eax), %xmm5 + movaps 0x60(%eax), %xmm6 + movaps 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $128, %ecx + movdqa %xmm0, (%edx) + movaps %xmm1, 0x10(%edx) + movaps %xmm2, 0x20(%edx) + movaps %xmm3, 0x30(%edx) + movaps %xmm4, 0x40(%edx) + movaps %xmm5, 0x50(%edx) + movaps %xmm6, 0x60(%edx) + movaps %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_cache_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(shl_0_cache_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx +L(shl_0_cache_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_cache_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx +L(shl_0_cache_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_cache_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx +L(shl_0_cache_less_16bytes): + add %ecx, %edx + add %ecx, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_0_gobble_mem_start): + cmp %al, %dl + je L(copy_page_by_rep) + sub $128, %ecx +L(shl_0_gobble_mem_loop): + prefetchnta 0x1c0(%eax) + prefetchnta 0x280(%eax) + prefetchnta 0x1c0(%edx) + prefetchnta 0x280(%edx) + + movdqa (%eax), %xmm0 + movaps 0x10(%eax), %xmm1 + movaps 0x20(%eax), %xmm2 + movaps 0x30(%eax), %xmm3 + movaps 0x40(%eax), %xmm4 + movaps 0x50(%eax), %xmm5 + movaps 0x60(%eax), %xmm6 + movaps 0x70(%eax), %xmm7 + lea 
0x80(%eax), %eax + sub $0x80, %ecx + movdqa %xmm0, (%edx) + movaps %xmm1, 0x10(%edx) + movaps %xmm2, 0x20(%edx) + movaps %xmm3, 0x30(%edx) + movaps %xmm4, 0x40(%edx) + movaps %xmm5, 0x50(%edx) + movaps %xmm6, 0x60(%edx) + movaps %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_mem_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(shl_0_mem_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx +L(shl_0_mem_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_mem_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx +L(shl_0_mem_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_mem_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx +L(shl_0_mem_less_16bytes): + add %ecx, %edx + add %ecx, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_1): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $1, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_1_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_1_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_1_loop) + +L(shl_1_end): + add $32, %ecx + add 
%ecx, %edi + add %edi, %edx + lea 1(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_2): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $2, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_2_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_2_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_2_loop) + +L(shl_2_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 2(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_3): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $3, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_3_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_3_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_3_loop) + +L(shl_3_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 3(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + 
cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_4): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $4, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_4_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_4_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_4_loop) + +L(shl_4_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 4(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_5): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $5, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_5_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_5_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_5_loop) + +L(shl_5_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 5(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_6): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $6, %eax + 
movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_6_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_6_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_6_loop) + +L(shl_6_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 6(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_7): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $7, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_7_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_7_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_7_loop) + +L(shl_7_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 7(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_8): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $8, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_8_loop): + + movdqa 16(%eax, %edi), 
%xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_8_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_8_loop) + +L(shl_8_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 8(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_9): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $9, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_9_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_9_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_9_loop) + +L(shl_9_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 9(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_10): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $10, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_10_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 
+ lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_10_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_10_loop) + +L(shl_10_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 10(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_11): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $11, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_11_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_11_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_11_loop) + +L(shl_11_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 11(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_12): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $12, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_12_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_12_end) + + 
movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_12_loop) + +L(shl_12_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 12(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_13): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $13, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_13_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_13_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_13_loop) + +L(shl_13_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 13(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_14): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $14, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_14_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_14_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr 
$14, %xmm2, %xmm3 + palignr $14, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_14_loop) + +L(shl_14_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 14(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_15): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $15, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_15_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_15_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_15_loop) + +L(shl_15_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 15(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(fwd_write_44bytes): + movl -44(%eax), %ecx + movl %ecx, -44(%edx) +L(fwd_write_40bytes): + movl -40(%eax), %ecx + movl %ecx, -40(%edx) +L(fwd_write_36bytes): + movl -36(%eax), %ecx + movl %ecx, -36(%edx) +L(fwd_write_32bytes): + movl -32(%eax), %ecx + movl %ecx, -32(%edx) +L(fwd_write_28bytes): + movl -28(%eax), %ecx + movl %ecx, -28(%edx) +L(fwd_write_24bytes): + movl -24(%eax), %ecx + movl %ecx, -24(%edx) +L(fwd_write_20bytes): + movl -20(%eax), %ecx + movl %ecx, -20(%edx) +L(fwd_write_16bytes): + movl -16(%eax), %ecx + movl %ecx, -16(%edx) +L(fwd_write_12bytes): + movl -12(%eax), %ecx + movl %ecx, -12(%edx) +L(fwd_write_8bytes): + movl -8(%eax), %ecx + movl %ecx, -8(%edx) +L(fwd_write_4bytes): + movl 
-4(%eax), %ecx + movl %ecx, -4(%edx) +L(fwd_write_0bytes): +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_5bytes): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_45bytes): + movl -45(%eax), %ecx + movl %ecx, -45(%edx) +L(fwd_write_41bytes): + movl -41(%eax), %ecx + movl %ecx, -41(%edx) +L(fwd_write_37bytes): + movl -37(%eax), %ecx + movl %ecx, -37(%edx) +L(fwd_write_33bytes): + movl -33(%eax), %ecx + movl %ecx, -33(%edx) +L(fwd_write_29bytes): + movl -29(%eax), %ecx + movl %ecx, -29(%edx) +L(fwd_write_25bytes): + movl -25(%eax), %ecx + movl %ecx, -25(%edx) +L(fwd_write_21bytes): + movl -21(%eax), %ecx + movl %ecx, -21(%edx) +L(fwd_write_17bytes): + movl -17(%eax), %ecx + movl %ecx, -17(%edx) +L(fwd_write_13bytes): + movl -13(%eax), %ecx + movl %ecx, -13(%edx) +L(fwd_write_9bytes): + movl -9(%eax), %ecx + movl %ecx, -9(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) +L(fwd_write_1bytes): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_46bytes): + movl -46(%eax), %ecx + movl %ecx, -46(%edx) +L(fwd_write_42bytes): + movl -42(%eax), %ecx + movl %ecx, -42(%edx) +L(fwd_write_38bytes): + movl -38(%eax), %ecx + movl %ecx, -38(%edx) +L(fwd_write_34bytes): + movl -34(%eax), %ecx + movl %ecx, -34(%edx) +L(fwd_write_30bytes): + movl -30(%eax), %ecx + movl %ecx, -30(%edx) +L(fwd_write_26bytes): + movl -26(%eax), %ecx + movl %ecx, -26(%edx) +L(fwd_write_22bytes): + movl -22(%eax), %ecx + movl %ecx, -22(%edx) +L(fwd_write_18bytes): + movl -18(%eax), %ecx + movl %ecx, -18(%edx) +L(fwd_write_14bytes): + movl -14(%eax), %ecx + movl %ecx, -14(%edx) 
+L(fwd_write_10bytes): + movl -10(%eax), %ecx + movl %ecx, -10(%edx) +L(fwd_write_6bytes): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) +L(fwd_write_2bytes): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_47bytes): + movl -47(%eax), %ecx + movl %ecx, -47(%edx) +L(fwd_write_43bytes): + movl -43(%eax), %ecx + movl %ecx, -43(%edx) +L(fwd_write_39bytes): + movl -39(%eax), %ecx + movl %ecx, -39(%edx) +L(fwd_write_35bytes): + movl -35(%eax), %ecx + movl %ecx, -35(%edx) +L(fwd_write_31bytes): + movl -31(%eax), %ecx + movl %ecx, -31(%edx) +L(fwd_write_27bytes): + movl -27(%eax), %ecx + movl %ecx, -27(%edx) +L(fwd_write_23bytes): + movl -23(%eax), %ecx + movl %ecx, -23(%edx) +L(fwd_write_19bytes): + movl -19(%eax), %ecx + movl %ecx, -19(%edx) +L(fwd_write_15bytes): + movl -15(%eax), %ecx + movl %ecx, -15(%edx) +L(fwd_write_11bytes): + movl -11(%eax), %ecx + movl %ecx, -11(%edx) +L(fwd_write_7bytes): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) +L(fwd_write_3bytes): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN_END + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(large_page): + movdqu (%eax), %xmm1 + movdqu %xmm0, (%esi) + movntdq %xmm1, (%edx) + add $0x10, %eax + add $0x10, %edx + sub $0x10, %ecx + cmp %al, %dl + je L(copy_page_by_rep) +L(large_page_loop_init): + POP (%esi) + sub $0x80, %ecx + POP (%edi) +L(large_page_loop): + prefetchnta 0x1c0(%eax) + prefetchnta 0x280(%eax) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + movdqu 0x40(%eax), %xmm4 + movdqu 0x50(%eax), %xmm5 + movdqu 0x60(%eax), %xmm6 + movdqu 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + lfence + sub $0x80, %ecx + movntdq %xmm0, (%edx) + 
movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + movntdq %xmm4, 0x40(%edx) + movntdq %xmm5, 0x50(%edx) + movntdq %xmm6, 0x60(%edx) + movntdq %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + jae L(large_page_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(large_page_less_64bytes) + + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + lea 0x40(%eax), %eax + + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + lea 0x40(%edx), %edx + sub $0x40, %ecx +L(large_page_less_64bytes): + cmp $32, %ecx + jb L(large_page_less_32bytes) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + lea 0x20(%eax), %eax + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + lea 0x20(%edx), %edx + sub $0x20, %ecx +L(large_page_less_32bytes): + add %ecx, %edx + add %ecx, %eax + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(copy_page_by_rep): + mov %eax, %esi + mov %edx, %edi + mov %ecx, %edx + shr $2, %ecx + and $3, %edx + rep movsl + jz L(copy_page_by_rep_exit) + cmp $2, %edx + jb L(copy_page_by_rep_left_1) + movzwl (%esi), %eax + movw %ax, (%edi) + add $2, %esi + add $2, %edi + sub $2, %edx + jz L(copy_page_by_rep_exit) +L(copy_page_by_rep_left_1): + movzbl (%esi), %eax + movb %al, (%edi) +L(copy_page_by_rep_exit): + POP (%esi) + POP (%edi) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_44bytes): + movl 40(%eax), %ecx + movl %ecx, 40(%edx) +L(bk_write_40bytes): + movl 36(%eax), %ecx + movl %ecx, 36(%edx) +L(bk_write_36bytes): + movl 32(%eax), %ecx + movl %ecx, 32(%edx) +L(bk_write_32bytes): + movl 28(%eax), %ecx + movl %ecx, 28(%edx) +L(bk_write_28bytes): + movl 24(%eax), %ecx + movl %ecx, 24(%edx) +L(bk_write_24bytes): + movl 20(%eax), %ecx + movl %ecx, 20(%edx) 
+L(bk_write_20bytes): + movl 16(%eax), %ecx + movl %ecx, 16(%edx) +L(bk_write_16bytes): + movl 12(%eax), %ecx + movl %ecx, 12(%edx) +L(bk_write_12bytes): + movl 8(%eax), %ecx + movl %ecx, 8(%edx) +L(bk_write_8bytes): + movl 4(%eax), %ecx + movl %ecx, 4(%edx) +L(bk_write_4bytes): + movl (%eax), %ecx + movl %ecx, (%edx) +L(bk_write_0bytes): +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_45bytes): + movl 41(%eax), %ecx + movl %ecx, 41(%edx) +L(bk_write_41bytes): + movl 37(%eax), %ecx + movl %ecx, 37(%edx) +L(bk_write_37bytes): + movl 33(%eax), %ecx + movl %ecx, 33(%edx) +L(bk_write_33bytes): + movl 29(%eax), %ecx + movl %ecx, 29(%edx) +L(bk_write_29bytes): + movl 25(%eax), %ecx + movl %ecx, 25(%edx) +L(bk_write_25bytes): + movl 21(%eax), %ecx + movl %ecx, 21(%edx) +L(bk_write_21bytes): + movl 17(%eax), %ecx + movl %ecx, 17(%edx) +L(bk_write_17bytes): + movl 13(%eax), %ecx + movl %ecx, 13(%edx) +L(bk_write_13bytes): + movl 9(%eax), %ecx + movl %ecx, 9(%edx) +L(bk_write_9bytes): + movl 5(%eax), %ecx + movl %ecx, 5(%edx) +L(bk_write_5bytes): + movl 1(%eax), %ecx + movl %ecx, 1(%edx) +L(bk_write_1bytes): + movzbl (%eax), %ecx + movb %cl, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_46bytes): + movl 42(%eax), %ecx + movl %ecx, 42(%edx) +L(bk_write_42bytes): + movl 38(%eax), %ecx + movl %ecx, 38(%edx) +L(bk_write_38bytes): + movl 34(%eax), %ecx + movl %ecx, 34(%edx) +L(bk_write_34bytes): + movl 30(%eax), %ecx + movl %ecx, 30(%edx) +L(bk_write_30bytes): + movl 26(%eax), %ecx + movl %ecx, 26(%edx) +L(bk_write_26bytes): + movl 22(%eax), %ecx + movl %ecx, 22(%edx) +L(bk_write_22bytes): + movl 18(%eax), %ecx + movl %ecx, 18(%edx) +L(bk_write_18bytes): + movl 14(%eax), %ecx + movl %ecx, 14(%edx) +L(bk_write_14bytes): + movl 10(%eax), %ecx + movl 
%ecx, 10(%edx) +L(bk_write_10bytes): + movl 6(%eax), %ecx + movl %ecx, 6(%edx) +L(bk_write_6bytes): + movl 2(%eax), %ecx + movl %ecx, 2(%edx) +L(bk_write_2bytes): + movzwl (%eax), %ecx + movw %cx, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_47bytes): + movl 43(%eax), %ecx + movl %ecx, 43(%edx) +L(bk_write_43bytes): + movl 39(%eax), %ecx + movl %ecx, 39(%edx) +L(bk_write_39bytes): + movl 35(%eax), %ecx + movl %ecx, 35(%edx) +L(bk_write_35bytes): + movl 31(%eax), %ecx + movl %ecx, 31(%edx) +L(bk_write_31bytes): + movl 27(%eax), %ecx + movl %ecx, 27(%edx) +L(bk_write_27bytes): + movl 23(%eax), %ecx + movl %ecx, 23(%edx) +L(bk_write_23bytes): + movl 19(%eax), %ecx + movl %ecx, 19(%edx) +L(bk_write_19bytes): + movl 15(%eax), %ecx + movl %ecx, 15(%edx) +L(bk_write_15bytes): + movl 11(%eax), %ecx + movl %ecx, 11(%edx) +L(bk_write_11bytes): + movl 7(%eax), %ecx + movl %ecx, 7(%edx) +L(bk_write_7bytes): + movl 3(%eax), %ecx + movl %ecx, 3(%edx) +L(bk_write_3bytes): + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN_END + + + .pushsection .rodata.ssse3,"a",@progbits + ALIGN (2) +L(table_48bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) + .int JMPTBL 
(L(fwd_write_10bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) + .int 
JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) + + ALIGN (2) +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + ALIGN (2) +L(table_48_bytes_bwd): + .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) + .int JMPTBL 
(L(bk_write_18bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) + + .popsection + +#ifdef USE_AS_MEMMOVE + ALIGN (4) +L(copy_backward): + PUSH (%esi) + movl %eax, %esi + add %ecx, %edx + add %ecx, %esi + testl $0x3, %edx + jnz L(bk_align) + +L(bk_aligned_4): + cmp $64, %ecx + jae L(bk_write_more64bytes) + 
+L(bk_write_64bytesless): + cmp $32, %ecx + jb L(bk_write_less32bytes) + +L(bk_write_more32bytes): + /* Copy 32 bytes at a time. */ + sub $32, %ecx + movl -4(%esi), %eax + movl %eax, -4(%edx) + movl -8(%esi), %eax + movl %eax, -8(%edx) + movl -12(%esi), %eax + movl %eax, -12(%edx) + movl -16(%esi), %eax + movl %eax, -16(%edx) + movl -20(%esi), %eax + movl %eax, -20(%edx) + movl -24(%esi), %eax + movl %eax, -24(%edx) + movl -28(%esi), %eax + movl %eax, -28(%edx) + movl -32(%esi), %eax + movl %eax, -32(%edx) + sub $32, %edx + sub $32, %esi + +L(bk_write_less32bytes): + movl %esi, %eax + sub %ecx, %edx + sub %ecx, %eax + POP (%esi) +L(bk_write_less48bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) + + CFI_PUSH (%esi) + ALIGN (4) +L(bk_align): + cmp $8, %ecx + jbe L(bk_write_less32bytes) + testl $1, %edx + /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, + then (EDX & 2) must be != 0. */ + jz L(bk_got2) + sub $1, %esi + sub $1, %ecx + sub $1, %edx + movzbl (%esi), %eax + movb %al, (%edx) + + testl $2, %edx + jz L(bk_aligned_4) + +L(bk_got2): + sub $2, %esi + sub $2, %ecx + sub $2, %edx + movzwl (%esi), %eax + movw %ax, (%edx) + jmp L(bk_aligned_4) + + ALIGN (4) +L(bk_write_more64bytes): + /* Check alignment of last byte. */ + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + +/* EDX is aligned 4 bytes, but not 16 bytes. 
*/ +L(bk_ssse3_align): + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + +L(bk_ssse3_cpy_pre): + cmp $64, %ecx + jb L(bk_write_more32bytes) + +L(bk_ssse3_cpy): + sub $64, %esi + sub $64, %ecx + sub $64, %edx + movdqu 0x30(%esi), %xmm3 + movdqa %xmm3, 0x30(%edx) + movdqu 0x20(%esi), %xmm2 + movdqa %xmm2, 0x20(%edx) + movdqu 0x10(%esi), %xmm1 + movdqa %xmm1, 0x10(%edx) + movdqu (%esi), %xmm0 + movdqa %xmm0, (%edx) + cmp $64, %ecx + jae L(bk_ssse3_cpy) + jmp L(bk_write_64bytesless) + +#endif + +END (MEMCPY) + +#endif diff --git a/sysdeps/i386/multiarch/memcpy-ssse3.S b/sysdeps/i386/multiarch/memcpy-ssse3.S new file mode 100644 index 0000000000..27ab6a2c3e --- /dev/null +++ b/sysdeps/i386/multiarch/memcpy-ssse3.S @@ -0,0 +1,3162 @@ +/* memcpy with SSSE3 + Copyright (C) 2010-2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>.
*/ + +#if IS_IN (libc) \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +# include <sysdep.h> +# include "asm-syntax.h" + +# ifndef MEMCPY +# define MEMCPY __memcpy_ssse3 +# define MEMCPY_CHK __memcpy_chk_ssse3 +# endif + +# ifdef USE_AS_BCOPY /* Stack argument order: src, dest, len. */ +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +# else /* Stack argument order: dest, src, len. */ +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifdef SHARED +# define PARMS 8 /* Preserve EBX. */ +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) /* Code after a mid-function ret still has EBX pushed. */ +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register that contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx, INDEX, SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx +# else + +# define PARMS 4 /* ENTRANCE pushes nothing. */ +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register that contains the index into the + jump table. SCALE is the scale of INDEX.
*/ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(, INDEX, SCALE) +# endif + + .section .text.ssse3,"ax",@progbits +# if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) /* Checked entry: no ret before END, so a passing check continues at MEMCPY. */ + movl 12(%esp), %eax /* EAX = third stack argument (the length); ENTRANCE has not run yet. */ + cmpl %eax, 16(%esp) /* Fourth stack argument vs. length; presumably the destination buffer size -- confirm against callers. */ + jb HIDDEN_JUMPTARGET (__chk_fail) /* Fourth argument below length (unsigned): go to __chk_fail. */ +END (MEMCPY_CHK) +# endif +ENTRY (MEMCPY) + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + +# ifdef USE_AS_MEMMOVE + cmp %eax, %edx + jb L(copy_forward) + je L(fwd_write_0bytes) + cmp $32, %ecx + jae L(memmove_bwd) + jmp L(bk_write_less32bytes_2) + + .p2align 4 +L(memmove_bwd): + add %ecx, %eax + cmp %eax, %edx + movl SRC(%esp), %eax + jb L(copy_backward) + +L(copy_forward): +# endif + cmp $48, %ecx + jae L(48bytesormore) + +L(fwd_write_less32bytes): +# ifndef USE_AS_MEMMOVE + cmp %dl, %al + jb L(bk_write) +# endif + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) +# ifndef USE_AS_MEMMOVE + .p2align 4 +L(bk_write): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) +# endif + + .p2align 4 +L(48bytesormore): +# ifndef USE_AS_MEMMOVE + movlpd (%eax), %xmm0 + movlpd 8(%eax), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) +# else + movdqu (%eax), %xmm0 +# endif + PUSH (%edi) + movl %edx, %edi + and $-16, %edx + add $16, %edx + sub %edx, %edi + add %edi, %ecx + sub %edi, %eax + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +# endif + + mov %eax, %edi + jae L(large_page) + and $0xf, %edi + jz L(shl_0) + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) + + .p2align 4 +L(shl_0): +# ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +# endif + xor %edi, %edi + cmp $127, %ecx + ja L(shl_0_gobble) + lea -32(%ecx), %ecx + + .p2align 4 +L(shl_0_loop): + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax,
%edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + +L(shl_0_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + add %edi, %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_0_gobble): +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + POP (%edi) + lea -128(%ecx), %ecx + jae L(shl_0_gobble_mem_loop) + + .p2align 4 +L(shl_0_gobble_cache_loop): + movdqa (%eax), %xmm0 + movdqa 0x10(%eax), %xmm1 + movdqa 0x20(%eax), %xmm2 + movdqa 0x30(%eax), %xmm3 + movdqa 0x40(%eax), %xmm4 + movdqa 0x50(%eax), %xmm5 + movdqa 0x60(%eax), %xmm6 + movdqa 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa %xmm2, 0x20(%edx) + movdqa %xmm3, 0x30(%edx) + movdqa %xmm4, 0x40(%edx) + movdqa %xmm5, 0x50(%edx) + movdqa %xmm6, 0x60(%edx) + movdqa %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_cache_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(shl_0_cache_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), 
%xmm1 + add $0x40, %eax + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx + +L(shl_0_cache_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_cache_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx + +L(shl_0_cache_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_cache_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx + +L(shl_0_cache_less_16bytes): + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 +L(shl_0_gobble_mem_loop): + prefetcht0 0x1c0(%eax) + prefetcht0 0x280(%eax) + prefetcht0 0x1c0(%edx) + + movdqa (%eax), %xmm0 + movdqa 0x10(%eax), %xmm1 + movdqa 0x20(%eax), %xmm2 + movdqa 0x30(%eax), %xmm3 + movdqa 0x40(%eax), %xmm4 + movdqa 0x50(%eax), %xmm5 + movdqa 0x60(%eax), %xmm6 + movdqa 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $0x80, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa %xmm2, 0x20(%edx) + movdqa %xmm3, 0x30(%edx) + movdqa %xmm4, 0x40(%edx) + movdqa %xmm5, 0x50(%edx) + movdqa %xmm6, 0x60(%edx) + movdqa %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_mem_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(shl_0_mem_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx + +L(shl_0_mem_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_mem_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx + +L(shl_0_mem_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_mem_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + 
movdqa %xmm0, (%edx) + add $0x10, %edx + +L(shl_0_mem_less_16bytes): + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) + + .p2align 4 +L(shl_1): +# ifndef USE_AS_MEMMOVE + movaps -1(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -1(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_1_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl1LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + movaps 47(%eax), %xmm4 + movaps 63(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + palignr $1, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $1, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $1, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl1LoopStart) + +L(Shl1LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_1_no_prefetch): + lea -32(%ecx), %ecx + lea -1(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_1_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_1_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), 
%xmm3 + movdqa %xmm3, %xmm1 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_1_no_prefetch_loop) + +L(sh_1_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 1(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_2): +# ifndef USE_AS_MEMMOVE + movaps -2(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -2(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_2_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl2LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + movaps 46(%eax), %xmm4 + movaps 62(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + palignr $2, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $2, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $2, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl2LoopStart) + +L(Shl2LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_2_no_prefetch): + lea -32(%ecx), %ecx + lea -2(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_2_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $2, 
%xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_2_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_2_no_prefetch_loop) + +L(sh_2_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 2(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_3): +# ifndef USE_AS_MEMMOVE + movaps -3(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -3(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_3_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl3LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + movaps 45(%eax), %xmm4 + movaps 61(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + palignr $3, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $3, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $3, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl3LoopStart) + +L(Shl3LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 
+L(sh_3_no_prefetch): + lea -32(%ecx), %ecx + lea -3(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_3_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_3_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_3_no_prefetch_loop) + +L(sh_3_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 3(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_4): +# ifndef USE_AS_MEMMOVE + movaps -4(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -4(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_4_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl4LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + movaps 44(%eax), %xmm4 + movaps 60(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + palignr $4, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $4, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $4, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl4LoopStart) + +L(Shl4LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + palignr $4, %xmm2, 
%xmm3 + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_4_no_prefetch): + lea -32(%ecx), %ecx + lea -4(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_4_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_4_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_4_no_prefetch_loop) + +L(sh_4_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 4(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_5): +# ifndef USE_AS_MEMMOVE + movaps -5(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -5(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_5_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl5LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + movaps 43(%eax), %xmm4 + movaps 59(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + palignr $5, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $5, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $5, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 
16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl5LoopStart) + +L(Shl5LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_5_no_prefetch): + lea -32(%ecx), %ecx + lea -5(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_5_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_5_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_5_no_prefetch_loop) + +L(sh_5_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 5(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_6): +# ifndef USE_AS_MEMMOVE + movaps -6(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -6(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_6_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl6LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + movaps 42(%eax), %xmm4 + 
movaps 58(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + palignr $6, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $6, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $6, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl6LoopStart) + +L(Shl6LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_6_no_prefetch): + lea -32(%ecx), %ecx + lea -6(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_6_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_6_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_6_no_prefetch_loop) + +L(sh_6_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 6(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_7): +# ifndef USE_AS_MEMMOVE + movaps -7(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -7(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp 
__x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_7_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl7LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + movaps 41(%eax), %xmm4 + movaps 57(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + palignr $7, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $7, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $7, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl7LoopStart) + +L(Shl7LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_7_no_prefetch): + lea -32(%ecx), %ecx + lea -7(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_7_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_7_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_7_no_prefetch_loop) + +L(sh_7_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 7(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_8): +# ifndef USE_AS_MEMMOVE + movaps -8(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -8(%eax), %xmm1 + movdqu 
%xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_8_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl8LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + movaps 40(%eax), %xmm4 + movaps 56(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + palignr $8, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $8, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $8, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl8LoopStart) + +L(LoopLeave8): + add $32, %ecx + jle L(shl_end_0) + + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_8_no_prefetch): + lea -32(%ecx), %ecx + lea -8(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_8_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_8_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_8_no_prefetch_loop) + +L(sh_8_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 8(%edi, %eax), %eax + POP 
(%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_9): +# ifndef USE_AS_MEMMOVE + movaps -9(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -9(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_9_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl9LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + movaps 39(%eax), %xmm4 + movaps 55(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + palignr $9, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $9, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $9, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl9LoopStart) + +L(Shl9LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_9_no_prefetch): + lea -32(%ecx), %ecx + lea -9(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_9_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_9_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm4, %xmm2 + lea 
32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_9_no_prefetch_loop) + +L(sh_9_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 9(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_10): +# ifndef USE_AS_MEMMOVE + movaps -10(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -10(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_10_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl10LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + movaps 38(%eax), %xmm4 + movaps 54(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + palignr $10, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $10, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $10, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl10LoopStart) + +L(Shl10LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_10_no_prefetch): + lea -32(%ecx), %ecx + lea -10(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_10_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa 
%xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_10_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_10_no_prefetch_loop) + +L(sh_10_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 10(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_11): +# ifndef USE_AS_MEMMOVE + movaps -11(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -11(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_11_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl11LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + movaps 37(%eax), %xmm4 + movaps 53(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + palignr $11, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $11, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $11, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl11LoopStart) + +L(Shl11LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_11_no_prefetch): + lea -32(%ecx), %ecx + lea 
-11(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_11_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_11_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_11_no_prefetch_loop) + +L(sh_11_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 11(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_12): +# ifndef USE_AS_MEMMOVE + movaps -12(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -12(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_12_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl12LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + movaps 36(%eax), %xmm4 + movaps 52(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + palignr $12, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $12, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $12, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl12LoopStart) + +L(Shl12LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + + 
movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_12_no_prefetch): + lea -32(%ecx), %ecx + lea -12(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_12_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_12_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_12_no_prefetch_loop) + +L(sh_12_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 12(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_13): +# ifndef USE_AS_MEMMOVE + movaps -13(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -13(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_13_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl13LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + movaps 35(%eax), %xmm4 + movaps 51(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + palignr $13, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $13, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $13, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, 
%xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl13LoopStart) + +L(Shl13LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_13_no_prefetch): + lea -32(%ecx), %ecx + lea -13(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_13_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_13_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_13_no_prefetch_loop) + +L(sh_13_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 13(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_14): +# ifndef USE_AS_MEMMOVE + movaps -14(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -14(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_14_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl14LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + movaps 34(%eax), %xmm4 + movaps 
50(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + palignr $14, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $14, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $14, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl14LoopStart) + +L(Shl14LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_14_no_prefetch): + lea -32(%ecx), %ecx + lea -14(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_14_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_14_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_14_no_prefetch_loop) + +L(sh_14_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 14(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_15): +# ifndef USE_AS_MEMMOVE + movaps -15(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -15(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp 
__x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_15_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl15LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + movaps 33(%eax), %xmm4 + movaps 49(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + palignr $15, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $15, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $15, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl15LoopStart) + +L(Shl15LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_15_no_prefetch): + lea -32(%ecx), %ecx + lea -15(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_15_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_15_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_15_no_prefetch_loop) + +L(sh_15_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 15(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_end_0): + lea 32(%ecx), %ecx + lea (%edx, %ecx), %edx + lea (%eax, %ecx), %eax + POP (%edi) + 
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 +L(fwd_write_44bytes): + movq -44(%eax), %xmm0 + movq %xmm0, -44(%edx) +L(fwd_write_36bytes): + movq -36(%eax), %xmm0 + movq %xmm0, -36(%edx) +L(fwd_write_28bytes): + movq -28(%eax), %xmm0 + movq %xmm0, -28(%edx) +L(fwd_write_20bytes): + movq -20(%eax), %xmm0 + movq %xmm0, -20(%edx) +L(fwd_write_12bytes): + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) +L(fwd_write_4bytes): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_40bytes): + movq -40(%eax), %xmm0 + movq %xmm0, -40(%edx) +L(fwd_write_32bytes): + movq -32(%eax), %xmm0 + movq %xmm0, -32(%edx) +L(fwd_write_24bytes): + movq -24(%eax), %xmm0 + movq %xmm0, -24(%edx) +L(fwd_write_16bytes): + movq -16(%eax), %xmm0 + movq %xmm0, -16(%edx) +L(fwd_write_8bytes): + movq -8(%eax), %xmm0 + movq %xmm0, -8(%edx) +L(fwd_write_0bytes): +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_5bytes): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_45bytes): + movq -45(%eax), %xmm0 + movq %xmm0, -45(%edx) +L(fwd_write_37bytes): + movq -37(%eax), %xmm0 + movq %xmm0, -37(%edx) +L(fwd_write_29bytes): + movq -29(%eax), %xmm0 + movq %xmm0, -29(%edx) +L(fwd_write_21bytes): + movq -21(%eax), %xmm0 + movq %xmm0, -21(%edx) +L(fwd_write_13bytes): + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_41bytes): + 
movq -41(%eax), %xmm0 + movq %xmm0, -41(%edx) +L(fwd_write_33bytes): + movq -33(%eax), %xmm0 + movq %xmm0, -33(%edx) +L(fwd_write_25bytes): + movq -25(%eax), %xmm0 + movq %xmm0, -25(%edx) +L(fwd_write_17bytes): + movq -17(%eax), %xmm0 + movq %xmm0, -17(%edx) +L(fwd_write_9bytes): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) +L(fwd_write_1bytes): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_46bytes): + movq -46(%eax), %xmm0 + movq %xmm0, -46(%edx) +L(fwd_write_38bytes): + movq -38(%eax), %xmm0 + movq %xmm0, -38(%edx) +L(fwd_write_30bytes): + movq -30(%eax), %xmm0 + movq %xmm0, -30(%edx) +L(fwd_write_22bytes): + movq -22(%eax), %xmm0 + movq %xmm0, -22(%edx) +L(fwd_write_14bytes): + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) +L(fwd_write_6bytes): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_42bytes): + movq -42(%eax), %xmm0 + movq %xmm0, -42(%edx) +L(fwd_write_34bytes): + movq -34(%eax), %xmm0 + movq %xmm0, -34(%edx) +L(fwd_write_26bytes): + movq -26(%eax), %xmm0 + movq %xmm0, -26(%edx) +L(fwd_write_18bytes): + movq -18(%eax), %xmm0 + movq %xmm0, -18(%edx) +L(fwd_write_10bytes): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) +L(fwd_write_2bytes): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_47bytes): + movq -47(%eax), %xmm0 + movq %xmm0, -47(%edx) +L(fwd_write_39bytes): + movq -39(%eax), %xmm0 + movq %xmm0, -39(%edx) +L(fwd_write_31bytes): + movq -31(%eax), %xmm0 + movq %xmm0, -31(%edx) +L(fwd_write_23bytes): + movq -23(%eax), %xmm0 + movq %xmm0, -23(%edx) 
+L(fwd_write_15bytes): + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) +L(fwd_write_7bytes): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_43bytes): + movq -43(%eax), %xmm0 + movq %xmm0, -43(%edx) +L(fwd_write_35bytes): + movq -35(%eax), %xmm0 + movq %xmm0, -35(%edx) +L(fwd_write_27bytes): + movq -27(%eax), %xmm0 + movq %xmm0, -27(%edx) +L(fwd_write_19bytes): + movq -19(%eax), %xmm0 + movq %xmm0, -19(%edx) +L(fwd_write_11bytes): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) +L(fwd_write_3bytes): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_40bytes_align): + movdqa -40(%eax), %xmm0 + movdqa %xmm0, -40(%edx) +L(fwd_write_24bytes_align): + movdqa -24(%eax), %xmm0 + movdqa %xmm0, -24(%edx) +L(fwd_write_8bytes_align): + movq -8(%eax), %xmm0 + movq %xmm0, -8(%edx) +L(fwd_write_0bytes_align): +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_32bytes_align): + movdqa -32(%eax), %xmm0 + movdqa %xmm0, -32(%edx) +L(fwd_write_16bytes_align): + movdqa -16(%eax), %xmm0 + movdqa %xmm0, -16(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_5bytes_align): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_45bytes_align): + movdqa -45(%eax), %xmm0 + movdqa 
%xmm0, -45(%edx) +L(fwd_write_29bytes_align): + movdqa -29(%eax), %xmm0 + movdqa %xmm0, -29(%edx) +L(fwd_write_13bytes_align): + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_37bytes_align): + movdqa -37(%eax), %xmm0 + movdqa %xmm0, -37(%edx) +L(fwd_write_21bytes_align): + movdqa -21(%eax), %xmm0 + movdqa %xmm0, -21(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_41bytes_align): + movdqa -41(%eax), %xmm0 + movdqa %xmm0, -41(%edx) +L(fwd_write_25bytes_align): + movdqa -25(%eax), %xmm0 + movdqa %xmm0, -25(%edx) +L(fwd_write_9bytes_align): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) +L(fwd_write_1bytes_align): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_33bytes_align): + movdqa -33(%eax), %xmm0 + movdqa %xmm0, -33(%edx) +L(fwd_write_17bytes_align): + movdqa -17(%eax), %xmm0 + movdqa %xmm0, -17(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_46bytes_align): + movdqa -46(%eax), %xmm0 + movdqa %xmm0, -46(%edx) +L(fwd_write_30bytes_align): + movdqa -30(%eax), %xmm0 + movdqa %xmm0, -30(%edx) +L(fwd_write_14bytes_align): + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) +L(fwd_write_6bytes_align): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl 
%edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_38bytes_align): + movdqa -38(%eax), %xmm0 + movdqa %xmm0, -38(%edx) +L(fwd_write_22bytes_align): + movdqa -22(%eax), %xmm0 + movdqa %xmm0, -22(%edx) + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_42bytes_align): + movdqa -42(%eax), %xmm0 + movdqa %xmm0, -42(%edx) +L(fwd_write_26bytes_align): + movdqa -26(%eax), %xmm0 + movdqa %xmm0, -26(%edx) +L(fwd_write_10bytes_align): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) +L(fwd_write_2bytes_align): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_34bytes_align): + movdqa -34(%eax), %xmm0 + movdqa %xmm0, -34(%edx) +L(fwd_write_18bytes_align): + movdqa -18(%eax), %xmm0 + movdqa %xmm0, -18(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_47bytes_align): + movdqa -47(%eax), %xmm0 + movdqa %xmm0, -47(%edx) +L(fwd_write_31bytes_align): + movdqa -31(%eax), %xmm0 + movdqa %xmm0, -31(%edx) +L(fwd_write_15bytes_align): + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) +L(fwd_write_7bytes_align): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_39bytes_align): + movdqa -39(%eax), %xmm0 + movdqa %xmm0, -39(%edx) +L(fwd_write_23bytes_align): + movdqa -23(%eax), %xmm0 + movdqa %xmm0, -23(%edx) + movl -7(%eax), %ecx + 
movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_43bytes_align): + movdqa -43(%eax), %xmm0 + movdqa %xmm0, -43(%edx) +L(fwd_write_27bytes_align): + movdqa -27(%eax), %xmm0 + movdqa %xmm0, -27(%edx) +L(fwd_write_11bytes_align): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) +L(fwd_write_3bytes_align): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_35bytes_align): + movdqa -35(%eax), %xmm0 + movdqa %xmm0, -35(%edx) +L(fwd_write_19bytes_align): + movdqa -19(%eax), %xmm0 + movdqa %xmm0, -19(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_44bytes_align): + movdqa -44(%eax), %xmm0 + movdqa %xmm0, -44(%edx) +L(fwd_write_28bytes_align): + movdqa -28(%eax), %xmm0 + movdqa %xmm0, -28(%edx) +L(fwd_write_12bytes_align): + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) +L(fwd_write_4bytes_align): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_36bytes_align): + movdqa -36(%eax), %xmm0 + movdqa %xmm0, -36(%edx) +L(fwd_write_20bytes_align): + movdqa -20(%eax), %xmm0 + movdqa %xmm0, -20(%edx) + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN_END + + CFI_PUSH (%edi) + + .p2align 4 +L(large_page): + movdqu (%eax), %xmm1 +# 
ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +# endif + lea 16(%eax), %eax + movntdq %xmm1, (%edx) + lea 16(%edx), %edx + lea -0x90(%ecx), %ecx + POP (%edi) + + .p2align 4 +L(large_page_loop): + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + movdqu 0x40(%eax), %xmm4 + movdqu 0x50(%eax), %xmm5 + movdqu 0x60(%eax), %xmm6 + movdqu 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + movntdq %xmm4, 0x40(%edx) + movntdq %xmm5, 0x50(%edx) + movntdq %xmm6, 0x60(%edx) + movntdq %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + jae L(large_page_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(large_page_less_64bytes) + + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + lea 0x40(%eax), %eax + + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + lea 0x40(%edx), %edx + sub $0x40, %ecx +L(large_page_less_64bytes): + cmp $32, %ecx + jb L(large_page_less_32bytes) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + lea 0x20(%eax), %eax + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + lea 0x20(%edx), %edx + sub $0x20, %ecx +L(large_page_less_32bytes): + add %ecx, %edx + add %ecx, %eax + sfence + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 +L(bk_write_44bytes): + movq 36(%eax), %xmm0 + movq %xmm0, 36(%edx) +L(bk_write_36bytes): + movq 28(%eax), %xmm0 + movq %xmm0, 28(%edx) +L(bk_write_28bytes): + movq 20(%eax), %xmm0 + movq %xmm0, 20(%edx) +L(bk_write_20bytes): + movq 12(%eax), %xmm0 + movq %xmm0, 12(%edx) +L(bk_write_12bytes): + movq 4(%eax), %xmm0 + movq %xmm0, 4(%edx) +L(bk_write_4bytes): + movl (%eax), %ecx + movl %ecx, (%edx) +L(bk_write_0bytes): +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + 
add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_40bytes): + movq 32(%eax), %xmm0 + movq %xmm0, 32(%edx) +L(bk_write_32bytes): + movq 24(%eax), %xmm0 + movq %xmm0, 24(%edx) +L(bk_write_24bytes): + movq 16(%eax), %xmm0 + movq %xmm0, 16(%edx) +L(bk_write_16bytes): + movq 8(%eax), %xmm0 + movq %xmm0, 8(%edx) +L(bk_write_8bytes): + movq (%eax), %xmm0 + movq %xmm0, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_45bytes): + movq 37(%eax), %xmm0 + movq %xmm0, 37(%edx) +L(bk_write_37bytes): + movq 29(%eax), %xmm0 + movq %xmm0, 29(%edx) +L(bk_write_29bytes): + movq 21(%eax), %xmm0 + movq %xmm0, 21(%edx) +L(bk_write_21bytes): + movq 13(%eax), %xmm0 + movq %xmm0, 13(%edx) +L(bk_write_13bytes): + movq 5(%eax), %xmm0 + movq %xmm0, 5(%edx) +L(bk_write_5bytes): + movl 1(%eax), %ecx + movl %ecx, 1(%edx) +L(bk_write_1bytes): + movzbl (%eax), %ecx + movb %cl, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_41bytes): + movq 33(%eax), %xmm0 + movq %xmm0, 33(%edx) +L(bk_write_33bytes): + movq 25(%eax), %xmm0 + movq %xmm0, 25(%edx) +L(bk_write_25bytes): + movq 17(%eax), %xmm0 + movq %xmm0, 17(%edx) +L(bk_write_17bytes): + movq 9(%eax), %xmm0 + movq %xmm0, 9(%edx) +L(bk_write_9bytes): + movq 1(%eax), %xmm0 + movq %xmm0, 1(%edx) + movzbl (%eax), %ecx + movb %cl, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_46bytes): + movq 38(%eax), %xmm0 + movq %xmm0, 38(%edx) +L(bk_write_38bytes): + movq 30(%eax), %xmm0 + movq %xmm0, 30(%edx) +L(bk_write_30bytes): + movq 22(%eax), %xmm0 + movq %xmm0, 22(%edx) +L(bk_write_22bytes): + movq 14(%eax), %xmm0 + movq %xmm0, 14(%edx) +L(bk_write_14bytes): + movq 6(%eax), 
%xmm0 + movq %xmm0, 6(%edx) +L(bk_write_6bytes): + movl 2(%eax), %ecx + movl %ecx, 2(%edx) + movzwl (%eax), %ecx + movw %cx, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_42bytes): + movq 34(%eax), %xmm0 + movq %xmm0, 34(%edx) +L(bk_write_34bytes): + movq 26(%eax), %xmm0 + movq %xmm0, 26(%edx) +L(bk_write_26bytes): + movq 18(%eax), %xmm0 + movq %xmm0, 18(%edx) +L(bk_write_18bytes): + movq 10(%eax), %xmm0 + movq %xmm0, 10(%edx) +L(bk_write_10bytes): + movq 2(%eax), %xmm0 + movq %xmm0, 2(%edx) +L(bk_write_2bytes): + movzwl (%eax), %ecx + movw %cx, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_47bytes): + movq 39(%eax), %xmm0 + movq %xmm0, 39(%edx) +L(bk_write_39bytes): + movq 31(%eax), %xmm0 + movq %xmm0, 31(%edx) +L(bk_write_31bytes): + movq 23(%eax), %xmm0 + movq %xmm0, 23(%edx) +L(bk_write_23bytes): + movq 15(%eax), %xmm0 + movq %xmm0, 15(%edx) +L(bk_write_15bytes): + movq 7(%eax), %xmm0 + movq %xmm0, 7(%edx) +L(bk_write_7bytes): + movl 3(%eax), %ecx + movl %ecx, 3(%edx) + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_43bytes): + movq 35(%eax), %xmm0 + movq %xmm0, 35(%edx) +L(bk_write_35bytes): + movq 27(%eax), %xmm0 + movq %xmm0, 27(%edx) +L(bk_write_27bytes): + movq 19(%eax), %xmm0 + movq %xmm0, 19(%edx) +L(bk_write_19bytes): + movq 11(%eax), %xmm0 + movq %xmm0, 11(%edx) +L(bk_write_11bytes): + movq 3(%eax), %xmm0 + movq %xmm0, 3(%edx) +L(bk_write_3bytes): + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl 
LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN_END + + + .pushsection .rodata.ssse3,"a",@progbits + .p2align 2 +L(table_48bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) + .int JMPTBL 
(L(fwd_write_31bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) + + .p2align 2 +L(table_48bytes_fwd_align): + .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_13bytes_align), 
L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_41bytes_align), 
L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align)) + + .p2align 2 +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + .p2align 2 +L(table_48_bytes_bwd): + .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) + .int JMPTBL 
(L(bk_write_14bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) + + 
.popsection + +# ifdef USE_AS_MEMMOVE + .p2align 4 +L(copy_backward): + PUSH (%edi) + movl %eax, %edi + lea (%ecx,%edx,1),%edx + lea (%ecx,%edi,1),%edi + testl $0x3, %edx + jnz L(bk_align) + +L(bk_aligned_4): + cmp $64, %ecx + jae L(bk_write_more64bytes) + +L(bk_write_64bytesless): + cmp $32, %ecx + jb L(bk_write_less32bytes) + +L(bk_write_more32bytes): + /* Copy 32 bytes at a time. */ + sub $32, %ecx + movq -8(%edi), %xmm0 + movq %xmm0, -8(%edx) + movq -16(%edi), %xmm0 + movq %xmm0, -16(%edx) + movq -24(%edi), %xmm0 + movq %xmm0, -24(%edx) + movq -32(%edi), %xmm0 + movq %xmm0, -32(%edx) + sub $32, %edx + sub $32, %edi + +L(bk_write_less32bytes): + movl %edi, %eax + sub %ecx, %edx + sub %ecx, %eax + POP (%edi) +L(bk_write_less32bytes_2): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(bk_align): + cmp $8, %ecx + jbe L(bk_write_less32bytes) + testl $1, %edx + /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, + then (EDX & 2) must be != 0. */ + jz L(bk_got2) + sub $1, %edi + sub $1, %ecx + sub $1, %edx + movzbl (%edi), %eax + movb %al, (%edx) + + testl $2, %edx + jz L(bk_aligned_4) + +L(bk_got2): + sub $2, %edi + sub $2, %ecx + sub $2, %edx + movzwl (%edi), %eax + movw %ax, (%edx) + jmp L(bk_aligned_4) + + .p2align 4 +L(bk_write_more64bytes): + /* Check alignment of last byte. */ + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + +/* EDX is aligned 4 bytes, but not 16 bytes. 
*/ +L(bk_ssse3_align): + sub $4, %edi + sub $4, %ecx + sub $4, %edx + movl (%edi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %edi + sub $4, %ecx + sub $4, %edx + movl (%edi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %edi + sub $4, %ecx + sub $4, %edx + movl (%edi), %eax + movl %eax, (%edx) + +L(bk_ssse3_cpy_pre): + cmp $64, %ecx + jb L(bk_write_more32bytes) + + .p2align 4 +L(bk_ssse3_cpy): + sub $64, %edi + sub $64, %ecx + sub $64, %edx + movdqu 0x30(%edi), %xmm3 + movdqa %xmm3, 0x30(%edx) + movdqu 0x20(%edi), %xmm2 + movdqa %xmm2, 0x20(%edx) + movdqu 0x10(%edi), %xmm1 + movdqa %xmm1, 0x10(%edx) + movdqu (%edi), %xmm0 + movdqa %xmm0, (%edx) + cmp $64, %ecx + jae L(bk_ssse3_cpy) + jmp L(bk_write_64bytesless) + +# endif + +END (MEMCPY) + +#endif diff --git a/sysdeps/i386/multiarch/memcpy.c b/sysdeps/i386/multiarch/memcpy.c new file mode 100644 index 0000000000..68401c754a --- /dev/null +++ b/sysdeps/i386/multiarch/memcpy.c @@ -0,0 +1,69 @@ +/* Multiple versions of memcpy. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in lib and for + DSO. 
In static binaries we need memcpy before the initialization + happened. */ +#if defined SHARED && IS_IN (libc) +/* Redefine memcpy so that the compiler won't complain about the type + mismatch with the IFUNC selector defined below. */ +# undef memcpy +# define memcpy __redirect_memcpy +# include <string.h> +# undef memcpy + +# include <init-arch.h> + +/* Hidden implementations the selector below chooses from. */ +extern __typeof (__redirect_memcpy) __memcpy_i386 attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_i586 attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_i686 attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_sse2_unaligned attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_ssse3 attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_ssse3_rep attribute_hidden; + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ +extern __typeof (__redirect_memcpy) memcpy; +extern void *memcpy_ifunc (void) __asm__ ("memcpy"); + +/* IFUNC selector for memcpy. With SSE2, return the sse2_unaligned + version if Fast_Unaligned_Load is set, else an SSSE3 version if SSSE3 + is available (ssse3_rep when Fast_Rep_String is set, ssse3 otherwise). + In all other cases fall back to the i686, i586 or i386 version, + tried in that order. */ +void * +memcpy_ifunc (void) +{ + if (HAS_CPU_FEATURE (SSE2)) + { + if (HAS_ARCH_FEATURE (Fast_Unaligned_Load)) + return __memcpy_sse2_unaligned; + else if (HAS_CPU_FEATURE (SSSE3)) + { + if (HAS_ARCH_FEATURE (Fast_Rep_String)) + return __memcpy_ssse3_rep; + else + return __memcpy_ssse3; + } + } + + if (USE_I686) + return __memcpy_i686; + else if (USE_I586) + return __memcpy_i586; + else + return __memcpy_i386; +} +__asm__ (".type memcpy, %gnu_indirect_function"); +#endif diff --git a/sysdeps/i386/multiarch/memcpy_chk.c b/sysdeps/i386/multiarch/memcpy_chk.c new file mode 100644 index 0000000000..aad6f87a45 --- /dev/null +++ b/sysdeps/i386/multiarch/memcpy_chk.c @@ -0,0 +1,70 @@ +/* Multiple versions of __memcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch memcpy functions for static binaries. + */ +#if IS_IN (libc) +# ifdef SHARED +/* NOTE(review): the operand of each bare "# include" in this file is + missing from this copy of the patch -- restore it from the upstream + commit. */ +# include + +/* Prototype of the checked entry point; the variants below take its + type. */ +extern void * __memcpy_chk (void *, const void *, size_t, size_t); + +/* Hidden per-variant implementations of __memcpy_chk. */ +extern __typeof (__memcpy_chk) __memcpy_chk_i386 attribute_hidden; +extern __typeof (__memcpy_chk) __memcpy_chk_i586 attribute_hidden; +extern __typeof (__memcpy_chk) __memcpy_chk_i686 attribute_hidden; +extern __typeof (__memcpy_chk) __memcpy_chk_sse2_unaligned attribute_hidden; +extern __typeof (__memcpy_chk) __memcpy_chk_ssse3 attribute_hidden; +extern __typeof (__memcpy_chk) __memcpy_chk_ssse3_rep attribute_hidden; + +# include + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. 
*/ +extern void *memcpy_chk_ifunc (void) __asm__ ("__memcpy_chk"); + +/* Choose the __memcpy_chk implementation from the CPU and arch feature + checks below. */ +void * +memcpy_chk_ifunc (void) +{ + /* The SSE2 and SSSE3 versions are only considered when SSE2 is set. */ + if (HAS_CPU_FEATURE (SSE2)) + { + if (HAS_ARCH_FEATURE (Fast_Unaligned_Load)) + return __memcpy_chk_sse2_unaligned; + + if (HAS_CPU_FEATURE (SSSE3)) + return (HAS_ARCH_FEATURE (Fast_Rep_String) + ? __memcpy_chk_ssse3_rep + : __memcpy_chk_ssse3); + } + + /* Otherwise take the first baseline version that applies. */ + if (USE_I686) + return __memcpy_chk_i686; + if (USE_I586) + return __memcpy_chk_i586; + return __memcpy_chk_i386; +} +__asm__ (".type __memcpy_chk, %gnu_indirect_function"); +# else +# include +# endif +#endif diff --git a/sysdeps/i386/multiarch/memmove-i386.S b/sysdeps/i386/multiarch/memmove-i386.S new file mode 100644 index 0000000000..af21d2d305 --- /dev/null +++ b/sysdeps/i386/multiarch/memmove-i386.S @@ -0,0 +1,11 @@ +#ifdef SHARED +# define memmove __memmove_i386 +# define __memmove_chk __memmove_chk_i386 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) +# include + + .globl __GI_memmove + .hidden __GI_memmove + __GI_memmove = __memmove_i386 +#endif diff --git a/sysdeps/i386/multiarch/memmove-i686.S b/sysdeps/i386/multiarch/memmove-i686.S new file mode 100644 index 0000000000..bde4051b49 --- /dev/null +++ b/sysdeps/i386/multiarch/memmove-i686.S @@ -0,0 +1,7 @@ +#ifdef SHARED +# define memmove __memmove_i686 +# define __memmove_chk __memmove_chk_i686 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) +# include +#endif diff --git a/sysdeps/i386/multiarch/memmove-sse2-unaligned.S b/sysdeps/i386/multiarch/memmove-sse2-unaligned.S new file mode 100644 index 0000000000..3873594cb2 --- /dev/null +++ b/sysdeps/i386/multiarch/memmove-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_sse2_unaligned +#define MEMCPY_CHK __memmove_chk_sse2_unaligned +#include "memcpy-sse2-unaligned.S" diff --git a/sysdeps/i386/multiarch/memmove-ssse3-rep.S b/sysdeps/i386/multiarch/memmove-ssse3-rep.S new file mode 
100644 index 0000000000..d202fc4a13 --- /dev/null +++ b/sysdeps/i386/multiarch/memmove-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3_rep +#define MEMCPY_CHK __memmove_chk_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/sysdeps/i386/multiarch/memmove-ssse3.S b/sysdeps/i386/multiarch/memmove-ssse3.S new file mode 100644 index 0000000000..295430b1ef --- /dev/null +++ b/sysdeps/i386/multiarch/memmove-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3 +#define MEMCPY_CHK __memmove_chk_ssse3 +#include "memcpy-ssse3.S" diff --git a/sysdeps/i386/multiarch/memmove.c b/sysdeps/i386/multiarch/memmove.c new file mode 100644 index 0000000000..e47d15ee5e --- /dev/null +++ b/sysdeps/i386/multiarch/memmove.c @@ -0,0 +1,66 @@ +/* Multiple versions of memmove. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need memmove before initialization + has happened. */ +#if defined SHARED && IS_IN (libc) +/* Redefine memmove so that the compiler won't complain about the type + mismatch with the IFUNC selector defined below. 
*/ +# undef memmove +# define memmove __redirect_memmove +# include <string.h> +# undef memmove + +# include <init-arch.h> + +/* Hidden implementations the selector below chooses from. */ +extern __typeof (__redirect_memmove) __memmove_i386 attribute_hidden; +extern __typeof (__redirect_memmove) __memmove_i686 attribute_hidden; +extern __typeof (__redirect_memmove) __memmove_sse2_unaligned attribute_hidden; +extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden; +extern __typeof (__redirect_memmove) __memmove_ssse3_rep attribute_hidden; + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ +extern __typeof (__redirect_memmove) memmove; +extern void *memmove_ifunc (void) __asm__ ("memmove"); + +/* IFUNC selector for memmove. With SSE2, return the sse2_unaligned + version if Fast_Unaligned_Load is set, else an SSSE3 version if SSSE3 + is available (ssse3_rep when Fast_Rep_String is set, ssse3 otherwise). + In all other cases return the i686 version if USE_I686 holds, else + the i386 version. */ +void * +memmove_ifunc (void) +{ + if (HAS_CPU_FEATURE (SSE2)) + { + if (HAS_ARCH_FEATURE (Fast_Unaligned_Load)) + return __memmove_sse2_unaligned; + else if (HAS_CPU_FEATURE (SSSE3)) + { + if (HAS_ARCH_FEATURE (Fast_Rep_String)) + return __memmove_ssse3_rep; + else + return __memmove_ssse3; + } + } + + if (USE_I686) + return __memmove_i686; + else + return __memmove_i386; +} +__asm__ (".type memmove, %gnu_indirect_function"); +#endif diff --git a/sysdeps/i386/multiarch/memmove_chk.c b/sysdeps/i386/multiarch/memmove_chk.c new file mode 100644 index 0000000000..d34c19ab67 --- /dev/null +++ b/sysdeps/i386/multiarch/memmove_chk.c @@ -0,0 +1,105 @@ +/* Multiple versions of __memmove_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) +/* NOTE(review): the operand of each bare "# include" in this file is + missing from this copy of the patch -- restore it from the upstream + commit. */ +# include + +/* Prototype of the checked entry point. */ +extern void * __memmove_chk (void *, const void *, size_t, size_t); + +# ifdef SHARED +/* SHARED build: the checked variants are only declared here. */ +extern __typeof (__memmove_chk) __memmove_chk_i386 attribute_hidden; +extern __typeof (__memmove_chk) __memmove_chk_i686 attribute_hidden; +extern __typeof (__memmove_chk) __memmove_chk_sse2_unaligned attribute_hidden; +extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden; +extern __typeof (__memmove_chk) __memmove_chk_ssse3_rep attribute_hidden; + +# include +# else +/* Redefine memmove so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef memmove +# define memmove __redirect_memmove +# include +# undef memmove + +/* Non-SHARED build: declare the unchecked memmove variants. */ +extern __typeof (__redirect_memmove) __memmove_i386 attribute_hidden; +extern __typeof (__redirect_memmove) __memmove_i686 attribute_hidden; +extern __typeof (__redirect_memmove) __memmove_sse2_unaligned attribute_hidden; +extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden; +extern __typeof (__redirect_memmove) __memmove_ssse3_rep attribute_hidden; + +/* Due to + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=10837 + the noreturn attribute disables tail call optimization. Remove the + noreturn attribute to enable tail call optimization. 
*/ +extern void *chk_fail (void) __asm__ ("__chk_fail") attribute_hidden; + +# include + +/* Define a static FUNC_chk_ARCH wrapper that calls chk_fail when DSTLEN + is smaller than LEN and otherwise forwards to FUNC_ARCH. */ +#define ifunc_chk(func, arch) \ +static void * \ +func##_chk_##arch (void *dstpp, const void *srcpp, size_t len, \ + size_t dstlen) \ +{ \ + if (__glibc_unlikely (dstlen < len)) \ + return chk_fail (); \ + return func##_##arch (dstpp, srcpp, len); \ +} + +/* The i386 wrapper is generated here only when MINIMUM_ISA is below + 686; otherwise __memmove_chk_i386 is just declared. */ +# if MINIMUM_ISA < 686 +ifunc_chk (__memmove, i386) +# else +extern __typeof (__memmove_chk) __memmove_chk_i386 attribute_hidden; +# endif + +ifunc_chk (__memmove, i686) +ifunc_chk (__memmove, sse2_unaligned) +ifunc_chk (__memmove, ssse3) +ifunc_chk (__memmove, ssse3_rep) +# endif + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ +extern void *memmove_chk_ifunc (void) __asm__ ("__memmove_chk"); + +/* IFUNC selector for __memmove_chk. With SSE2, return the + sse2_unaligned version if Fast_Unaligned_Load is set, else an SSSE3 + version if SSSE3 is available (ssse3_rep when Fast_Rep_String is set, + ssse3 otherwise). In all other cases return the i686 version if + USE_I686 holds, else the i386 version. */ +void * +memmove_chk_ifunc (void) +{ + if (HAS_CPU_FEATURE (SSE2)) + { + if (HAS_ARCH_FEATURE (Fast_Unaligned_Load)) + return __memmove_chk_sse2_unaligned; + else if (HAS_CPU_FEATURE (SSSE3)) + { + if (HAS_ARCH_FEATURE (Fast_Rep_String)) + return __memmove_chk_ssse3_rep; + else + return __memmove_chk_ssse3; + } + } + + if (USE_I686) + return __memmove_chk_i686; + else + return __memmove_chk_i386; +} +__asm__ (".type __memmove_chk, %gnu_indirect_function"); +#endif diff --git a/sysdeps/i386/multiarch/mempcpy-i386.S b/sysdeps/i386/multiarch/mempcpy-i386.S new file mode 100644 index 0000000000..39f72b6a1c --- /dev/null +++ b/sysdeps/i386/multiarch/mempcpy-i386.S @@ -0,0 +1,18 @@ +#ifdef SHARED +# define __mempcpy __mempcpy_i386 +# define __mempcpy_chk __mempcpy_chk_i386 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) +# undef libc_hidden_def +# define libc_hidden_def(name) +# undef weak_alias +# define weak_alias(name, aliasname) +# include + + .globl __GI_mempcpy + .hidden __GI_mempcpy + __GI_mempcpy = __mempcpy_i386 + .globl __GI___mempcpy + .hidden __GI___mempcpy + __GI___mempcpy = __mempcpy_i386 +#endif diff --git a/sysdeps/i386/multiarch/mempcpy-i586.S 
b/sysdeps/i386/multiarch/mempcpy-i586.S new file mode 100644 index 0000000000..cb0d241a60 --- /dev/null +++ b/sysdeps/i386/multiarch/mempcpy-i586.S @@ -0,0 +1,11 @@ +#ifdef SHARED +/* Rename __mempcpy and __mempcpy_chk to their _i586 names and make + libc_hidden_builtin_def, libc_hidden_def and weak_alias expand to + nothing before pulling in the implementation. + NOTE(review): the "# include" operand is missing in this copy -- + restore it from the upstream commit. */ +# define __mempcpy __mempcpy_i586 +# define __mempcpy_chk __mempcpy_chk_i586 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) +# undef libc_hidden_def +# define libc_hidden_def(name) +# undef weak_alias +# define weak_alias(name, aliasname) +# include +#endif diff --git a/sysdeps/i386/multiarch/mempcpy-i686.S b/sysdeps/i386/multiarch/mempcpy-i686.S new file mode 100644 index 0000000000..1b5fba6c1a --- /dev/null +++ b/sysdeps/i386/multiarch/mempcpy-i686.S @@ -0,0 +1,11 @@ +#ifdef SHARED +/* Same scheme as above, using the _i686 names. + NOTE(review): the "# include" operand is missing here as well. */ +# define __mempcpy __mempcpy_i686 +# define __mempcpy_chk __mempcpy_chk_i686 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) +# undef libc_hidden_def +# define libc_hidden_def(name) +# undef weak_alias +# define weak_alias(name, aliasname) +# include +#endif diff --git a/sysdeps/i386/multiarch/mempcpy-sse2-unaligned.S b/sysdeps/i386/multiarch/mempcpy-sse2-unaligned.S new file mode 100644 index 0000000000..a1cea50771 --- /dev/null +++ b/sysdeps/i386/multiarch/mempcpy-sse2-unaligned.S @@ -0,0 +1,4 @@ +/* Build memcpy-sse2-unaligned.S with USE_AS_MEMPCPY defined, under the + mempcpy names. */ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_sse2_unaligned +#define MEMCPY_CHK __mempcpy_chk_sse2_unaligned +#include "memcpy-sse2-unaligned.S" diff --git a/sysdeps/i386/multiarch/mempcpy-ssse3-rep.S b/sysdeps/i386/multiarch/mempcpy-ssse3-rep.S new file mode 100644 index 0000000000..5357b33e18 --- /dev/null +++ b/sysdeps/i386/multiarch/mempcpy-ssse3-rep.S @@ -0,0 +1,4 @@ +/* Build memcpy-ssse3-rep.S with USE_AS_MEMPCPY defined, under the + mempcpy names. */ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_ssse3_rep +#define MEMCPY_CHK __mempcpy_chk_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/sysdeps/i386/multiarch/mempcpy-ssse3.S b/sysdeps/i386/multiarch/mempcpy-ssse3.S new file mode 100644 index 0000000000..822d98e954 --- /dev/null +++ b/sysdeps/i386/multiarch/mempcpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMPCPY +#define MEMCPY 
__mempcpy_ssse3 +#define MEMCPY_CHK __mempcpy_chk_ssse3 +#include "memcpy-ssse3.S" diff --git a/sysdeps/i386/multiarch/mempcpy.c b/sysdeps/i386/multiarch/mempcpy.c new file mode 100644 index 0000000000..a267477d20 --- /dev/null +++ b/sysdeps/i386/multiarch/mempcpy.c @@ -0,0 +1,71 @@ +/* Multiple versions of mempcpy. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need mempcpy before initialization + has happened. */ +#if defined SHARED && IS_IN (libc) +# define _HAVE_STRING_ARCH_mempcpy +/* Redefine mempcpy so that the compiler won't complain about the type + mismatch with the IFUNC selector defined below. 
*/ +# undef mempcpy +# define mempcpy __redirect_mempcpy +# include <string.h> +# undef mempcpy + +# include <init-arch.h> + +/* Hidden implementations the selector below chooses from. */ +extern __typeof (__redirect_mempcpy) __mempcpy_i386 attribute_hidden; +extern __typeof (__redirect_mempcpy) __mempcpy_i586 attribute_hidden; +extern __typeof (__redirect_mempcpy) __mempcpy_i686 attribute_hidden; +extern __typeof (__redirect_mempcpy) __mempcpy_sse2_unaligned attribute_hidden; +extern __typeof (__redirect_mempcpy) __mempcpy_ssse3 attribute_hidden; +extern __typeof (__redirect_mempcpy) __mempcpy_ssse3_rep attribute_hidden; + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ +extern __typeof (__redirect_mempcpy) __mempcpy; +extern void *mempcpy_ifunc (void) __asm__ ("__mempcpy"); + +/* IFUNC selector for __mempcpy. With SSE2, return the sse2_unaligned + version if Fast_Unaligned_Load is set, else an SSSE3 version if SSSE3 + is available (ssse3_rep when Fast_Rep_String is set, ssse3 otherwise). + In all other cases fall back to the i686, i586 or i386 version, + tried in that order. */ +void * +mempcpy_ifunc (void) +{ + if (HAS_CPU_FEATURE (SSE2)) + { + if (HAS_ARCH_FEATURE (Fast_Unaligned_Load)) + return __mempcpy_sse2_unaligned; + else if (HAS_CPU_FEATURE (SSSE3)) + { + if (HAS_ARCH_FEATURE (Fast_Rep_String)) + return __mempcpy_ssse3_rep; + else + return __mempcpy_ssse3; + } + } + + if (USE_I686) + return __mempcpy_i686; + else if (USE_I586) + return __mempcpy_i586; + else + return __mempcpy_i386; +} +__asm__ (".type __mempcpy, %gnu_indirect_function"); +weak_alias (__mempcpy, mempcpy) +#endif diff --git a/sysdeps/i386/multiarch/mempcpy_chk.c b/sysdeps/i386/multiarch/mempcpy_chk.c new file mode 100644 index 0000000000..b1e1fd6b8d --- /dev/null +++ b/sysdeps/i386/multiarch/mempcpy_chk.c @@ -0,0 +1,70 @@ +/* Multiple versions of __mempcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2015 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch mempcpy functions for static binaries. + */ +#if IS_IN (libc) +# ifdef SHARED +# include + +extern void * __mempcpy_chk (void *, const void *, size_t, size_t); + +extern __typeof (__mempcpy_chk) __mempcpy_chk_i386 attribute_hidden; +extern __typeof (__mempcpy_chk) __mempcpy_chk_i586 attribute_hidden; +extern __typeof (__mempcpy_chk) __mempcpy_chk_i686 attribute_hidden; +extern __typeof (__mempcpy_chk) __mempcpy_chk_sse2_unaligned attribute_hidden; +extern __typeof (__mempcpy_chk) __mempcpy_chk_ssse3 attribute_hidden; +extern __typeof (__mempcpy_chk) __mempcpy_chk_ssse3_rep attribute_hidden; + +# include <init-arch.h> + +/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly.
*/ +extern void *mempcpy_chk_ifunc (void) __asm__ ("__mempcpy_chk"); + +void * +mempcpy_chk_ifunc (void) +{ + if (HAS_CPU_FEATURE (SSE2)) + { + if (HAS_ARCH_FEATURE (Fast_Unaligned_Load)) + return __mempcpy_chk_sse2_unaligned; + else if (HAS_CPU_FEATURE (SSSE3)) + { + if (HAS_ARCH_FEATURE (Fast_Rep_String)) + return __mempcpy_chk_ssse3_rep; + else + return __mempcpy_chk_ssse3; + } + } + + if (USE_I686) + return __mempcpy_chk_i686; + else if (USE_I586) + return __mempcpy_chk_i586; + else + return __mempcpy_chk_i386; +} +__asm__ (".type __mempcpy_chk, %gnu_indirect_function"); +# else +# include +# endif +#endif diff --git a/sysdeps/i386/multiarch/rtld-memmove.S b/sysdeps/i386/multiarch/rtld-memmove.S new file mode 100644 index 0000000000..d1312ecf3c --- /dev/null +++ b/sysdeps/i386/multiarch/rtld-memmove.S @@ -0,0 +1,19 @@ +/* memmove for ld.so + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include diff --git a/sysdeps/i386/multiarch/static-memcpy.S b/sysdeps/i386/multiarch/static-memcpy.S new file mode 100644 index 0000000000..b7d6b2aea9 --- /dev/null +++ b/sysdeps/i386/multiarch/static-memcpy.S @@ -0,0 +1,21 @@ +/* memcpy for libc.a + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library.
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined SHARED && IS_IN (libc) +# include +#endif diff --git a/sysdeps/i386/multiarch/static-memmove.S b/sysdeps/i386/multiarch/static-memmove.S new file mode 100644 index 0000000000..086f394b29 --- /dev/null +++ b/sysdeps/i386/multiarch/static-memmove.S @@ -0,0 +1,21 @@ +/* memmove for libc.a + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>.
*/ + +#if !defined SHARED && IS_IN (libc) +# include +#endif diff --git a/sysdeps/i386/multiarch/static-mempcpy.S b/sysdeps/i386/multiarch/static-mempcpy.S new file mode 100644 index 0000000000..27c035c003 --- /dev/null +++ b/sysdeps/i386/multiarch/static-mempcpy.S @@ -0,0 +1,21 @@ +/* mempcpy for libc.a + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if !defined SHARED && IS_IN (libc) +# include +#endif -- cgit 1.4.1