Diffstat (limited to 'REORG.TODO/sysdeps/i386')
450 files changed, 62011 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/i386/Implies b/REORG.TODO/sysdeps/i386/Implies new file mode 100644 index 0000000000..20b2dffc29 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/Implies @@ -0,0 +1,5 @@ +x86 +wordsize-32 +ieee754/ldbl-96 +ieee754/dbl-64 +ieee754/flt-32 diff --git a/REORG.TODO/sysdeps/i386/Makefile b/REORG.TODO/sysdeps/i386/Makefile new file mode 100644 index 0000000000..e30e1339f0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/Makefile @@ -0,0 +1,103 @@ +# The mpn functions need a #define for asm syntax flavor. +# Every i386 port in use uses gas syntax (I think). +asm-CPPFLAGS += -DGAS_SYNTAX + +# The i386 `long double' is a distinct type we support. +long-double-fcts = yes + +ifeq ($(subdir),string) +sysdep_routines += cacheinfo +endif + +ifeq ($(subdir),gmon) +sysdep_routines += i386-mcount +endif + +ifeq ($(subdir),elf) +CFLAGS-rtld.c += -Wno-uninitialized -Wno-unused +CFLAGS-dl-load.c += -Wno-unused +CFLAGS-dl-reloc.c += -Wno-unused +endif + +ifeq ($(subdir),debug) +CFLAGS-backtrace.c += -fexceptions +endif + +# Most of the glibc routines don't ever call user-defined callbacks +# nor use any FPU or SSE* and as such don't need bigger %esp alignment +# than 4 bytes. +# Lots of routines in math will use the FPU, so make the math subdir an +# exception here. +# In gcc 4.6 (and maybe earlier?) giving -mpreferred-stack-boundary=2 is +# an error, so don't try to reduce it here like we used to. We still +# explicitly set -mpreferred-stack-boundary=4 in the places where it matters, +# in case an older compiler defaulted to 2. +ifeq ($(subdir),math) +sysdep-CFLAGS += -mpreferred-stack-boundary=4 +else +ifeq ($(subdir),csu) +sysdep-CFLAGS += -mpreferred-stack-boundary=4 +gen-as-const-headers += link-defines.sym +else +# Likewise, any function which calls user callbacks +uses-callbacks += -mpreferred-stack-boundary=4 +# Likewise, any stack alignment tests +stack-align-test-flags += -malign-double -mpreferred-stack-boundary=4 +endif +endif + +# And a couple of other routines +ifeq ($(subdir),stdlib) +CFLAGS-exit.c += -mpreferred-stack-boundary=4 +CFLAGS-cxa_finalize.c += -mpreferred-stack-boundary=4 +endif +ifeq ($(subdir),elf) +CFLAGS-dl-init.c += -mpreferred-stack-boundary=4 +CFLAGS-dl-fini.c += -mpreferred-stack-boundary=4 +CFLAGS-dl-open.c += -mpreferred-stack-boundary=4 +CFLAGS-dl-close.c += -mpreferred-stack-boundary=4 +CFLAGS-dl-error.c += -mpreferred-stack-boundary=4 +endif +ifeq ($(subdir),dlfcn) +CFLAGS-dlopen.c += -mpreferred-stack-boundary=4 +CFLAGS-dlopenold.c += -mpreferred-stack-boundary=4 +CFLAGS-dlclose.c += -mpreferred-stack-boundary=4 +CFLAGS-dlerror.c += -mpreferred-stack-boundary=4 +endif + +ifneq (,$(filter -mno-tls-direct-seg-refs,$(CFLAGS))) +defines += -DNO_TLS_DIRECT_SEG_REFS +endif + +ifeq ($(subdir),elf) +sysdep-dl-routines += tlsdesc dl-tlsdesc + +tests += tst-audit3 +modules-names += tst-auditmod3a tst-auditmod3b + +$(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so +$(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so +tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so +endif + +ifeq ($(subdir),csu) +gen-as-const-headers += tlsdesc.sym +endif + +# Make sure no code in ld.so uses mm/xmm/ymm/zmm registers on i386 since +# the first 3 mm/xmm/ymm/zmm registers are used to pass vector parameters +# which must be preserved. +# With SSE disabled, ensure -fpmath is not set to use sse either.
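+# (Both CFLAGS-.os additions below apply $(rtld-CFLAGS) only to objects that +# belong to ld.so itself, by matching $(@F) against the rtld routine names.)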
+rtld-CFLAGS += -mno-sse -mno-mmx -mfpmath=387 +ifeq ($(subdir),elf) +CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\ + $(rtld-CFLAGS)) + +tests-special += $(objpfx)tst-ld-sse-use.out +$(objpfx)tst-ld-sse-use.out: ../sysdeps/i386/tst-ld-sse-use.sh $(objpfx)ld.so + @echo "Checking ld.so for SSE register use. This will take a few seconds..." + $(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \ + $(evaluate-test) +else +CFLAGS-.os += $(if $(filter rtld-%.os,$(@F)), $(rtld-CFLAGS)) +endif diff --git a/REORG.TODO/sysdeps/i386/Versions b/REORG.TODO/sysdeps/i386/Versions new file mode 100644 index 0000000000..7be44aad7a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/Versions @@ -0,0 +1,35 @@ +ld { + GLIBC_2.3 { + # The alternative i386 runtime interface to TLS. + ___tls_get_addr; + } +} +libc { + GLIBC_2.0 { + # Functions from libgcc. + __divdi3; __moddi3; __udivdi3; __umoddi3; + } + GLIBC_2.1 { + # global variable + _fp_hw; + } + GLIBC_2.1.1 { + # extern inline functions used by <bits/string.h> + __memcpy_c; __memset_cc; __memset_cg; __memset_gg; + __memcpy_by2; __memcpy_by4; __memcpy_g; __mempcpy_by2; __mempcpy_by4; + __mempcpy_byn; __memset_ccn_by2; __memset_ccn_by4; __memset_gcn_by2; + __memset_gcn_by4; __stpcpy_g; __strcat_c; __strcat_g; __strchr_c; + __strchr_g; __strchrnul_c; __strchrnul_g; __strcmp_gg; __strcpy_g; + __strcspn_c1; __strcspn_cg; __strcspn_g; __strlen_g; __strncat_g; + __strncmp_g; __strncpy_by2; __strncpy_by4; __strncpy_byn; __strncpy_gg; + __strpbrk_cg; __strpbrk_g; __strrchr_c; __strrchr_g; __strspn_c1; + __strspn_cg; __strspn_g; __strstr_cg; __strstr_g; + } +} +libm { + GLIBC_2.1 { + # A generic bug got this omitted from other configurations' version + # sets, but we always had it. + exp2l; + } +} diff --git a/REORG.TODO/sysdeps/i386/____longjmp_chk.S b/REORG.TODO/sysdeps/i386/____longjmp_chk.S new file mode 100644 index 0000000000..0910861a9d --- /dev/null +++ b/REORG.TODO/sysdeps/i386/____longjmp_chk.S @@ -0,0 +1 @@ +#error "OS-specific version needed" diff --git a/REORG.TODO/sysdeps/i386/__longjmp.S b/REORG.TODO/sysdeps/i386/__longjmp.S new file mode 100644 index 0000000000..3719763cd6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/__longjmp.S @@ -0,0 +1,72 @@ +/* longjmp for i386. + Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <jmpbuf-offsets.h> +#include <asm-syntax.h> +#include <stap-probe.h> + + .text +ENTRY (__longjmp) +#ifdef PTR_DEMANGLE + movl 4(%esp), %eax /* User's jmp_buf in %eax. */ + + /* Save the return address now. */ + movl (JB_PC*4)(%eax), %edx + /* Get the stack pointer. 
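+ Both the PC and SP were stored mangled by setjmp; the PTR_DEMANGLE below + reverses that per-process transformation (on Linux/i386 an xor with the + pointer guard and a rotate), so a forged jmp_buf cannot redirect control.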
*/ + movl (JB_SP*4)(%eax), %ecx + PTR_DEMANGLE (%edx) + PTR_DEMANGLE (%ecx) + LIBC_PROBE (longjmp, 3, 4@%eax, -4@8(%esp), 4@%edx) + cfi_def_cfa(%eax, 0) + cfi_register(%eip, %edx) + cfi_register(%esp, %ecx) + cfi_offset(%ebx, JB_BX*4) + cfi_offset(%esi, JB_SI*4) + cfi_offset(%edi, JB_DI*4) + cfi_offset(%ebp, JB_BP*4) + /* Restore registers. */ + movl (JB_BX*4)(%eax), %ebx + movl (JB_SI*4)(%eax), %esi + movl (JB_DI*4)(%eax), %edi + movl (JB_BP*4)(%eax), %ebp + cfi_restore(%ebx) + cfi_restore(%esi) + cfi_restore(%edi) + cfi_restore(%ebp) + + LIBC_PROBE (longjmp_target, 3, 4@%eax, -4@8(%esp), 4@%edx) + movl 8(%esp), %eax /* Second argument is return value. */ + movl %ecx, %esp +#else + movl 4(%esp), %ecx /* User's jmp_buf in %ecx. */ + movl 8(%esp), %eax /* Second argument is return value. */ + /* Save the return address now. */ + movl (JB_PC*4)(%ecx), %edx + LIBC_PROBE (longjmp, 3, 4@%ecx, -4@%eax, 4@%edx) + /* Restore registers. */ + movl (JB_BX*4)(%ecx), %ebx + movl (JB_SI*4)(%ecx), %esi + movl (JB_DI*4)(%ecx), %edi + movl (JB_BP*4)(%ecx), %ebp + movl (JB_SP*4)(%ecx), %esp + LIBC_PROBE (longjmp_target, 3, 4@%ecx, -4@%ecx, 4@%edx) +#endif + /* Jump to saved PC. */ + jmp *%edx +END (__longjmp) diff --git a/REORG.TODO/sysdeps/i386/abort-instr.h b/REORG.TODO/sysdeps/i386/abort-instr.h new file mode 100644 index 0000000000..810f10379b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/abort-instr.h @@ -0,0 +1,2 @@ +/* An instruction which should crash any program is `hlt'. */ +#define ABORT_INSTRUCTION asm ("hlt") diff --git a/REORG.TODO/sysdeps/i386/add_n.S b/REORG.TODO/sysdeps/i386/add_n.S new file mode 100644 index 0000000000..c2923094a8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/add_n.S @@ -0,0 +1,111 @@ +/* Add two limb vectors of the same length > 0 and store sum in a third + limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+8 /* space for 2 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define S2 S1+4 +#define SIZE S2+4 + + .text +ENTRY (__mpn_add_n) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + + movl RES(%esp),%edi + cfi_rel_offset (edi, 4) + movl S1(%esp),%esi + cfi_rel_offset (esi, 0) + movl S2(%esp),%edx + movl SIZE(%esp),%ecx + movl %ecx,%eax + shrl $3,%ecx /* compute count for unrolled loop */ + negl %eax + andl $7,%eax /* get index where to start loop */ + jz L(oop) /* necessary special case for 0 */ + incl %ecx /* adjust loop count */ + shll $2,%eax /* adjustment for pointers... */ + subl %eax,%edi /* ... since they are offset ... */ + subl %eax,%esi /* ... by a constant when we ... */ + subl %eax,%edx /* ... 
enter the loop */ + shrl $2,%eax /* restore previous value */ +#ifdef PIC +/* Calculate start address in loop for PIC. Due to limitations in some + assemblers, Loop-L0-3 cannot be put into the leal */ + call L(0) + cfi_adjust_cfa_offset (4) +L(0): leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $(L(oop)-L(0)-3),%eax + addl $4,%esp + cfi_adjust_cfa_offset (-4) +#else +/* Calculate start address in loop for non-PIC. */ + leal (L(oop) - 3)(%eax,%eax,8),%eax +#endif + jmp *%eax /* jump into loop */ + ALIGN (3) +L(oop): movl (%esi),%eax + adcl (%edx),%eax + movl %eax,(%edi) + movl 4(%esi),%eax + adcl 4(%edx),%eax + movl %eax,4(%edi) + movl 8(%esi),%eax + adcl 8(%edx),%eax + movl %eax,8(%edi) + movl 12(%esi),%eax + adcl 12(%edx),%eax + movl %eax,12(%edi) + movl 16(%esi),%eax + adcl 16(%edx),%eax + movl %eax,16(%edi) + movl 20(%esi),%eax + adcl 20(%edx),%eax + movl %eax,20(%edi) + movl 24(%esi),%eax + adcl 24(%edx),%eax + movl %eax,24(%edi) + movl 28(%esi),%eax + adcl 28(%edx),%eax + movl %eax,28(%edi) + leal 32(%edi),%edi + leal 32(%esi),%esi + leal 32(%edx),%edx + decl %ecx + jnz L(oop) + + sbbl %eax,%eax + negl %eax + + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_add_n) diff --git a/REORG.TODO/sysdeps/i386/addmul_1.S b/REORG.TODO/sysdeps/i386/addmul_1.S new file mode 100644 index 0000000000..ad90ea53e5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/addmul_1.S @@ -0,0 +1,86 @@ +/* i80386 __mpn_addmul_1 -- Multiply a limb vector with a limb and add + the result to a second limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define SIZE S1+4 +#define S2LIMB SIZE+4 + +#define res_ptr edi +#define s1_ptr esi +#define sizeP ecx +#define s2_limb ebx + + .text +ENTRY (__mpn_addmul_1) + + pushl %res_ptr + cfi_adjust_cfa_offset (4) + pushl %s1_ptr + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %s2_limb + cfi_adjust_cfa_offset (4) + + movl RES(%esp), %res_ptr + cfi_rel_offset (res_ptr, 12) + movl S1(%esp), %s1_ptr + cfi_rel_offset (s1_ptr, 8) + movl SIZE(%esp), %sizeP + movl S2LIMB(%esp), %s2_limb + cfi_rel_offset (s2_limb, 0) + leal (%res_ptr,%sizeP,4), %res_ptr + leal (%s1_ptr,%sizeP,4), %s1_ptr + negl %sizeP + xorl %ebp, %ebp + cfi_rel_offset (ebp, 4) + ALIGN (3) +L(oop): + movl (%s1_ptr,%sizeP,4), %eax + mull %s2_limb + addl %ebp, %eax + adcl $0, %edx + addl %eax, (%res_ptr,%sizeP,4) + adcl $0, %edx + movl %edx, %ebp + + incl %sizeP + jnz L(oop) + movl %ebp, %eax + + popl %s2_limb + cfi_adjust_cfa_offset (-4) + cfi_restore (s2_limb) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %s1_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (s1_ptr) + popl %res_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (res_ptr) + + ret +END (__mpn_addmul_1) diff --git a/REORG.TODO/sysdeps/i386/asm-syntax.h b/REORG.TODO/sysdeps/i386/asm-syntax.h new file mode 100644 index 0000000000..a992da2dd1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/asm-syntax.h @@ -0,0 +1,24 @@ +/* Definitions for x86 syntax variations. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. Its master source is NOT part of + the C library, however. The master source lives in the GNU MP Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#undef ALIGN +#define ALIGN(log) .align 1<<log + +#undef L +#define L(body) .L##body diff --git a/REORG.TODO/sysdeps/i386/atomic-machine.h b/REORG.TODO/sysdeps/i386/atomic-machine.h new file mode 100644 index 0000000000..0e24200617 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/atomic-machine.h @@ -0,0 +1,545 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdint.h> +#include <tls.h> /* For tcbhead_t. */ + + +typedef int8_t atomic8_t; +typedef uint8_t uatomic8_t; +typedef int_fast8_t atomic_fast8_t; +typedef uint_fast8_t uatomic_fast8_t; + +typedef int16_t atomic16_t; +typedef uint16_t uatomic16_t; +typedef int_fast16_t atomic_fast16_t; +typedef uint_fast16_t uatomic_fast16_t; + +typedef int32_t atomic32_t; +typedef uint32_t uatomic32_t; +typedef int_fast32_t atomic_fast32_t; +typedef uint_fast32_t uatomic_fast32_t; + +typedef int64_t atomic64_t; +typedef uint64_t uatomic64_t; +typedef int_fast64_t atomic_fast64_t; +typedef uint_fast64_t uatomic_fast64_t; + +typedef intptr_t atomicptr_t; +typedef uintptr_t uatomicptr_t; +typedef intmax_t atomic_max_t; +typedef uintmax_t uatomic_max_t; + + +#ifndef LOCK_PREFIX +# ifdef UP +# define LOCK_PREFIX /* nothing */ +# else +# define LOCK_PREFIX "lock;" +# endif +#endif + +#define __HAVE_64B_ATOMICS 0 +#define USE_ATOMIC_COMPILER_BUILTINS 0 +#define ATOMIC_EXCHANGE_USES_CAS 0 + + +#define atomic_compare_and_exchange_val_acq(mem, newval, oldval) \ + __sync_val_compare_and_swap (mem, oldval, newval) +#define atomic_compare_and_exchange_bool_acq(mem, newval, oldval) \ + (! __sync_bool_compare_and_swap (mem, oldval, newval)) + + +#define __arch_c_compare_and_exchange_val_8_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("cmpl $0, %%gs:%P5\n\t" \ + "je 0f\n\t" \ + "lock\n" \ + "0:\tcmpxchgb %b2, %1" \ + : "=a" (ret), "=m" (*mem) \ + : "q" (newval), "m" (*mem), "0" (oldval), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + ret; }) + +#define __arch_c_compare_and_exchange_val_16_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("cmpl $0, %%gs:%P5\n\t" \ + "je 0f\n\t" \ + "lock\n" \ + "0:\tcmpxchgw %w2, %1" \ + : "=a" (ret), "=m" (*mem) \ + : "r" (newval), "m" (*mem), "0" (oldval), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + ret; }) + +#define __arch_c_compare_and_exchange_val_32_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("cmpl $0, %%gs:%P5\n\t" \ + "je 0f\n\t" \ + "lock\n" \ + "0:\tcmpxchgl %2, %1" \ + : "=a" (ret), "=m" (*mem) \ + : "r" (newval), "m" (*mem), "0" (oldval), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + ret; }) + +/* XXX We do not really need 64-bit compare-and-exchange. At least + not at the moment. Using it would mean causing portability + problems since not many other 32-bit architectures have support for + such an operation. So don't define any code for now. If it is + really going to be used the code below can be used on Intel Pentium + and later, but NOT on i486.
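+ (A genuine 64-bit CAS needs the cmpxchg8b instruction, which first + appeared with the Pentium; the disabled implementations below rely on it.)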
*/ +#if 1 +# define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret = *(mem); \ + abort (); \ + ret = (newval); \ + ret = (oldval); \ + ret; }) +# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret = *(mem); \ + abort (); \ + ret = (newval); \ + ret = (oldval); \ + ret; }) +#else +# ifdef __PIC__ +# define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("xchgl %2, %%ebx\n\t" \ + LOCK_PREFIX "cmpxchg8b %1\n\t" \ + "xchgl %2, %%ebx" \ + : "=A" (ret), "=m" (*mem) \ + : "DS" (((unsigned long long int) (newval)) \ + & 0xffffffff), \ + "c" (((unsigned long long int) (newval)) >> 32), \ + "m" (*mem), "a" (((unsigned long long int) (oldval)) \ + & 0xffffffff), \ + "d" (((unsigned long long int) (oldval)) >> 32)); \ + ret; }) + +# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("xchgl %2, %%ebx\n\t" \ + "cmpl $0, %%gs:%P7\n\t" \ + "je 0f\n\t" \ + "lock\n" \ + "0:\tcmpxchg8b %1\n\t" \ + "xchgl %2, %%ebx" \ + : "=A" (ret), "=m" (*mem) \ + : "DS" (((unsigned long long int) (newval)) \ + & 0xffffffff), \ + "c" (((unsigned long long int) (newval)) >> 32), \ + "m" (*mem), "a" (((unsigned long long int) (oldval)) \ + & 0xffffffff), \ + "d" (((unsigned long long int) (oldval)) >> 32), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + ret; }) +# else +# define __arch_compare_and_exchange_val_64_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile (LOCK_PREFIX "cmpxchg8b %1" \ + : "=A" (ret), "=m" (*mem) \ + : "b" (((unsigned long long int) (newval)) \ + & 0xffffffff), \ + "c" (((unsigned long long int) (newval)) >> 32), \ + "m" (*mem), "a" (((unsigned long long int) (oldval)) \ + & 0xffffffff), \ + "d" (((unsigned long long int) (oldval)) >> 32)); \ + ret; }) + +# define __arch_c_compare_and_exchange_val_64_acq(mem, newval, oldval) \ + ({ __typeof (*mem) ret; \ + __asm __volatile ("cmpl $0, %%gs:%P7\n\t" \ + "je 0f\n\t" \ + "lock\n" \ + "0:\tcmpxchg8b %1" \ + : "=A" (ret), "=m" (*mem) \ + : "b" (((unsigned long long int) (newval)) \ + & 0xffffffff), \ + "c" (((unsigned long long int) (newval)) >> 32), \ + "m" (*mem), "a" (((unsigned long long int) (oldval)) \ + & 0xffffffff), \ + "d" (((unsigned long long int) (oldval)) >> 32), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + ret; }) +# endif +#endif + + +/* Note that we need no lock prefix. 
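+ The xchg instruction asserts the bus lock implicitly whenever one of its + operands is in memory, so an explicit LOCK_PREFIX would be redundant here.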
*/ +#define atomic_exchange_acq(mem, newvalue) \ + ({ __typeof (*mem) result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile ("xchgb %b0, %1" \ + : "=q" (result), "=m" (*mem) \ + : "0" (newvalue), "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile ("xchgw %w0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" (newvalue), "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile ("xchgl %0, %1" \ + : "=r" (result), "=m" (*mem) \ + : "0" (newvalue), "m" (*mem)); \ + else \ + { \ + result = 0; \ + abort (); \ + } \ + result; }) + + +#define __arch_exchange_and_add_body(lock, pfx, mem, value) \ + ({ __typeof (*mem) __result; \ + __typeof (value) __addval = (value); \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "xaddb %b0, %1" \ + : "=q" (__result), "=m" (*mem) \ + : "0" (__addval), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "xaddw %w0, %1" \ + : "=r" (__result), "=m" (*mem) \ + : "0" (__addval), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "xaddl %0, %1" \ + : "=r" (__result), "=m" (*mem) \ + : "0" (__addval), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + { \ + __typeof (mem) __memp = (mem); \ + __typeof (*mem) __tmpval; \ + __result = *__memp; \ + do \ + __tmpval = __result; \ + while ((__result = pfx##_compare_and_exchange_val_64_acq \ + (__memp, __result + __addval, __result)) == __tmpval); \ + } \ + __result; }) + +#define atomic_exchange_and_add(mem, value) \ + __sync_fetch_and_add (mem, value) + +#define __arch_exchange_and_add_cprefix \ + "cmpl $0, %%gs:%P4\n\tje 0f\n\tlock\n0:\t" + +#define catomic_exchange_and_add(mem, value) \ + __arch_exchange_and_add_body (__arch_exchange_and_add_cprefix, __arch_c, \ + mem, value) + + +#define __arch_add_body(lock, pfx, mem, value) \ + do { \ + if (__builtin_constant_p (value) && (value) == 1) \ + atomic_increment (mem); \ + else if (__builtin_constant_p (value) && (value) == -1) \ + atomic_decrement (mem); \ + else if (sizeof (*mem) == 1) \ + __asm __volatile (lock "addb %b1, %0" \ + : "=m" (*mem) \ + : "iq" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "addw %w1, %0" \ + : "=m" (*mem) \ + : "ir" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "addl %1, %0" \ + : "=m" (*mem) \ + : "ir" (value), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + { \ + __typeof (value) __addval = (value); \ + __typeof (mem) __memp = (mem); \ + __typeof (*mem) __oldval = *__memp; \ + __typeof (*mem) __tmpval; \ + do \ + __tmpval = __oldval; \ + while ((__oldval = pfx##_compare_and_exchange_val_64_acq \ + (__memp, __oldval + __addval, __oldval)) == __tmpval); \ + } \ + } while (0) + +#define atomic_add(mem, value) \ + __arch_add_body (LOCK_PREFIX, __arch, mem, value) + +#define __arch_add_cprefix \ + "cmpl $0, %%gs:%P3\n\tje 0f\n\tlock\n0:\t" + +#define catomic_add(mem, value) \ + __arch_add_body (__arch_add_cprefix, __arch_c, mem, value) + + +#define atomic_add_negative(mem, value) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "addb %b2, %0; sets %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "iq" (value), "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "addw %w2, %0; sets %1" \ + : 
"=m" (*mem), "=qm" (__result) \ + : "ir" (value), "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "addl %2, %0; sets %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "ir" (value), "m" (*mem)); \ + else \ + abort (); \ + __result; }) + + +#define atomic_add_zero(mem, value) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "addb %b2, %0; setz %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "iq" (value), "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "addw %w2, %0; setz %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "ir" (value), "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "addl %2, %0; setz %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "ir" (value), "m" (*mem)); \ + else \ + abort (); \ + __result; }) + + +#define __arch_increment_body(lock, pfx, mem) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "incb %b0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "incw %w0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "incl %0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + { \ + __typeof (mem) __memp = (mem); \ + __typeof (*mem) __oldval = *__memp; \ + __typeof (*mem) __tmpval; \ + do \ + __tmpval = __oldval; \ + while ((__oldval = pfx##_compare_and_exchange_val_64_acq \ + (__memp, __oldval + 1, __oldval)) == __tmpval); \ + } \ + } while (0) + +#define atomic_increment(mem) __arch_increment_body (LOCK_PREFIX, __arch, mem) + +#define __arch_increment_cprefix \ + "cmpl $0, %%gs:%P2\n\tje 0f\n\tlock\n0:\t" + +#define catomic_increment(mem) \ + __arch_increment_body (__arch_increment_cprefix, __arch_c, mem) + + +#define atomic_increment_and_test(mem) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "incb %0; sete %b1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "incw %0; sete %w1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "incl %0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else \ + abort (); \ + __result; }) + + +#define __arch_decrement_body(lock, pfx, mem) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "decb %b0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "decw %w0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "decl %0" \ + : "=m" (*mem) \ + : "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + { \ + __typeof (mem) __memp = (mem); \ + __typeof (*mem) __oldval = *__memp; \ + __typeof (*mem) __tmpval; \ + do \ + __tmpval = __oldval; \ + while ((__oldval = pfx##_compare_and_exchange_val_64_acq \ + (__memp, __oldval - 1, __oldval)) == __tmpval); \ + } \ + } while (0) + +#define atomic_decrement(mem) __arch_decrement_body (LOCK_PREFIX, __arch, mem) + +#define __arch_decrement_cprefix \ + "cmpl $0, %%gs:%P2\n\tje 0f\n\tlock\n0:\t" + +#define catomic_decrement(mem) \ + __arch_decrement_body (__arch_decrement_cprefix, __arch_c, mem) + + 
+#define atomic_decrement_and_test(mem) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "decb %b0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "decw %w0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "decl %0; sete %1" \ + : "=m" (*mem), "=qm" (__result) \ + : "m" (*mem)); \ + else \ + abort (); \ + __result; }) + + +#define atomic_bit_set(mem, bit) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "orb %b2, %0" \ + : "=m" (*mem) \ + : "m" (*mem), "iq" (1 << (bit))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "orw %w2, %0" \ + : "=m" (*mem) \ + : "m" (*mem), "ir" (1 << (bit))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "orl %2, %0" \ + : "=m" (*mem) \ + : "m" (*mem), "ir" (1 << (bit))); \ + else \ + abort (); \ + } while (0) + + +#define atomic_bit_test_set(mem, bit) \ + ({ unsigned char __result; \ + if (sizeof (*mem) == 1) \ + __asm __volatile (LOCK_PREFIX "btsb %3, %1; setc %0" \ + : "=q" (__result), "=m" (*mem) \ + : "m" (*mem), "ir" (bit)); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (LOCK_PREFIX "btsw %3, %1; setc %0" \ + : "=q" (__result), "=m" (*mem) \ + : "m" (*mem), "ir" (bit)); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (LOCK_PREFIX "btsl %3, %1; setc %0" \ + : "=q" (__result), "=m" (*mem) \ + : "m" (*mem), "ir" (bit)); \ + else \ + abort (); \ + __result; }) + + +#define atomic_spin_nop() asm ("rep; nop") + + +#define __arch_and_body(lock, mem, mask) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "andb %b1, %0" \ + : "=m" (*mem) \ + : "iq" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "andw %w1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "andl %1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + abort (); \ + } while (0) + +#define __arch_cprefix \ + "cmpl $0, %%gs:%P3\n\tje 0f\n\tlock\n0:\t" + +#define atomic_and(mem, mask) __arch_and_body (LOCK_PREFIX, mem, mask) + +#define catomic_and(mem, mask) __arch_and_body (__arch_cprefix, mem, mask) + + +#define __arch_or_body(lock, mem, mask) \ + do { \ + if (sizeof (*mem) == 1) \ + __asm __volatile (lock "orb %b1, %0" \ + : "=m" (*mem) \ + : "iq" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 2) \ + __asm __volatile (lock "orw %w1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else if (sizeof (*mem) == 4) \ + __asm __volatile (lock "orl %1, %0" \ + : "=m" (*mem) \ + : "ir" (mask), "m" (*mem), \ + "i" (offsetof (tcbhead_t, multiple_threads))); \ + else \ + abort (); \ + } while (0) + +#define atomic_or(mem, mask) __arch_or_body (LOCK_PREFIX, mem, mask) + +#define catomic_or(mem, mask) __arch_or_body (__arch_cprefix, mem, mask) + +/* We don't use mfence because it is supposedly slower due to having to + provide stronger guarantees (e.g., regarding self-modifying code). 
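+ A locked read-modify-write that stores the same value back, like the + `orl $0' on the stack below, provides the same full-barrier ordering at + lower cost, since every lock-prefixed instruction acts as a full memory + barrier on x86.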
*/ +#define atomic_full_barrier() \ + __asm __volatile (LOCK_PREFIX "orl $0, (%%esp)" ::: "memory") +#define atomic_read_barrier() __asm ("" ::: "memory") +#define atomic_write_barrier() __asm ("" ::: "memory") diff --git a/REORG.TODO/sysdeps/i386/backtrace.c b/REORG.TODO/sysdeps/i386/backtrace.c new file mode 100644 index 0000000000..ee8238d0ce --- /dev/null +++ b/REORG.TODO/sysdeps/i386/backtrace.c @@ -0,0 +1,163 @@ +/* Return backtrace of current program state. + Copyright (C) 1998-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <libc-lock.h> +#include <dlfcn.h> +#include <execinfo.h> +#include <stdlib.h> +#include <unwind.h> + +struct trace_arg +{ + void **array; + int cnt, size; + void *lastebp, *lastesp; +}; + +#ifdef SHARED +static _Unwind_Reason_Code (*unwind_backtrace) (_Unwind_Trace_Fn, void *); +static _Unwind_Ptr (*unwind_getip) (struct _Unwind_Context *); +static _Unwind_Ptr (*unwind_getcfa) (struct _Unwind_Context *); +static _Unwind_Ptr (*unwind_getgr) (struct _Unwind_Context *, int); +static void *libgcc_handle; + +static void +init (void) +{ + libgcc_handle = __libc_dlopen ("libgcc_s.so.1"); + + if (libgcc_handle == NULL) + return; + + unwind_backtrace = __libc_dlsym (libgcc_handle, "_Unwind_Backtrace"); + unwind_getip = __libc_dlsym (libgcc_handle, "_Unwind_GetIP"); + unwind_getcfa = __libc_dlsym (libgcc_handle, "_Unwind_GetCFA"); + unwind_getgr = __libc_dlsym (libgcc_handle, "_Unwind_GetGR"); + if (unwind_getip == NULL || unwind_getgr == NULL || unwind_getcfa == NULL) + { + unwind_backtrace = NULL; + __libc_dlclose (libgcc_handle); + libgcc_handle = NULL; + } +} +#else +# define unwind_backtrace _Unwind_Backtrace +# define unwind_getip _Unwind_GetIP +# define unwind_getcfa _Unwind_GetCFA +# define unwind_getgr _Unwind_GetGR +#endif + +static _Unwind_Reason_Code +backtrace_helper (struct _Unwind_Context *ctx, void *a) +{ + struct trace_arg *arg = a; + + /* We are first called with address in the __backtrace function. + Skip it. */ + if (arg->cnt != -1) + arg->array[arg->cnt] = (void *) unwind_getip (ctx); + if (++arg->cnt == arg->size) + return _URC_END_OF_STACK; + + /* %ebp is DWARF2 register 5 on IA-32. */ + arg->lastebp = (void *) unwind_getgr (ctx, 5); + arg->lastesp = (void *) unwind_getcfa (ctx); + return _URC_NO_REASON; +} + + +/* This is a global variable set at program start time. It marks the + highest used stack address. */ +extern void *__libc_stack_end; + + +/* This is the stack layout we see with every stack frame + when not compiled with -fomit-frame-pointer. + + +-----------------+ +-----------------+ + %ebp -> | %ebp last frame--------> | %ebp last frame--->...
+ | | | | + | return address | | return address | + +-----------------+ +-----------------+ + + First try to get as far as possible using + _Unwind_Backtrace which handles -fomit-frame-pointer + as well, but requires .eh_frame info. Then fall back to + walking the stack manually. */ + +struct layout +{ + struct layout *ebp; + void *ret; +}; + + +int +__backtrace (void **array, int size) +{ + struct trace_arg arg = { .array = array, .size = size, .cnt = -1 }; + + if (size <= 0) + return 0; + +#ifdef SHARED + __libc_once_define (static, once); + + __libc_once (once, init); + if (unwind_backtrace == NULL) + return 0; +#endif + + unwind_backtrace (backtrace_helper, &arg); + + if (arg.cnt > 1 && arg.array[arg.cnt - 1] == NULL) + --arg.cnt; + else if (arg.cnt < size) + { + struct layout *ebp = (struct layout *) arg.lastebp; + + while (arg.cnt < size) + { + /* Check for out of range. */ + if ((void *) ebp < arg.lastesp || (void *) ebp > __libc_stack_end + || ((long) ebp & 3)) + break; + + array[arg.cnt++] = ebp->ret; + ebp = ebp->ebp; + } + } + return arg.cnt != -1 ? arg.cnt : 0; +} +weak_alias (__backtrace, backtrace) +libc_hidden_def (__backtrace) + + +#ifdef SHARED +/* Free all resources if necessary. */ +libc_freeres_fn (free_mem) +{ + unwind_backtrace = NULL; + if (libgcc_handle != NULL) + { + __libc_dlclose (libgcc_handle); + libgcc_handle = NULL; + } +} +#endif diff --git a/REORG.TODO/sysdeps/i386/bcopy.S b/REORG.TODO/sysdeps/i386/bcopy.S new file mode 100644 index 0000000000..12b8ddb886 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/bcopy.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY bcopy +#include "memcpy.S" diff --git a/REORG.TODO/sysdeps/i386/bsd-_setjmp.S b/REORG.TODO/sysdeps/i386/bsd-_setjmp.S new file mode 100644 index 0000000000..6496304946 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/bsd-_setjmp.S @@ -0,0 +1,56 @@ +/* BSD `_setjmp' entry point to `sigsetjmp (..., 0)'. i386 version. + Copyright (C) 1994-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This just does a tail-call to `__sigsetjmp (ARG, 0)'. + We cannot do it in C because it must be a tail-call, so frame-unwinding + in setjmp doesn't clobber the state restored by longjmp. */ + +#include <sysdep.h> +#include <jmpbuf-offsets.h> +#include <stap-probe.h> + +#define PARMS 4 /* no space for saved regs */ +#define JMPBUF PARMS +#define SIGMSK JMPBUF+4 + +ENTRY (_setjmp) + + xorl %eax, %eax + movl JMPBUF(%esp), %edx + + /* Save registers. */ + movl %ebx, (JB_BX*4)(%edx) + movl %esi, (JB_SI*4)(%edx) + movl %edi, (JB_DI*4)(%edx) + leal JMPBUF(%esp), %ecx /* Save SP as it will be after we return. */ +#ifdef PTR_MANGLE + PTR_MANGLE (%ecx) +#endif + movl %ecx, (JB_SP*4)(%edx) + movl 0(%esp), %ecx /* Save PC we are returning to now.
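+ When pointer mangling is enabled it is stored mangled, like the stack + pointer above, so a corrupted jmp_buf cannot be used to divert execution.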
*/ + LIBC_PROBE (setjmp, 3, 4@%edx, -4@$0, 4@%ecx) +#ifdef PTR_MANGLE + PTR_MANGLE (%ecx) +#endif + movl %ecx, (JB_PC*4)(%edx) + movl %ebp, (JB_BP*4)(%edx) /* Save caller's frame pointer. */ + + movl %eax, JB_SIZE(%edx) /* No signal mask set. */ + ret +END (_setjmp) +libc_hidden_def (_setjmp) diff --git a/REORG.TODO/sysdeps/i386/bsd-setjmp.S b/REORG.TODO/sysdeps/i386/bsd-setjmp.S new file mode 100644 index 0000000000..5710e1f42b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/bsd-setjmp.S @@ -0,0 +1,66 @@ +/* BSD `setjmp' entry point to `sigsetjmp (..., 1)'. i386 version. + Copyright (C) 1994-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This just does a tail-call to `__sigsetjmp (ARG, 1)'. + We cannot do it in C because it must be a tail-call, so frame-unwinding + in setjmp doesn't clobber the state restored by longjmp. */ + +#include <sysdep.h> +#include <jmpbuf-offsets.h> +#include <stap-probe.h> + +#define PARMS 4 /* no space for saved regs */ +#define JMPBUF PARMS +#define SIGMSK JMPBUF+4 + +ENTRY (setjmp) + /* Note that we have to use a non-exported symbol in the next + jump since otherwise gas will emit it as a jump through the + PLT which is what we cannot use here. */ + + movl JMPBUF(%esp), %eax + + /* Save registers. */ + movl %ebx, (JB_BX*4)(%eax) + movl %esi, (JB_SI*4)(%eax) + movl %edi, (JB_DI*4)(%eax) + leal JMPBUF(%esp), %ecx /* Save SP as it will be after we return. */ +#ifdef PTR_MANGLE + PTR_MANGLE (%ecx) +#endif + movl %ecx, (JB_SP*4)(%eax) + movl 0(%esp), %ecx /* Save PC we are returning to now. */ + LIBC_PROBE (setjmp, 3, 4@%eax, -4@$1, 4@%ecx) +#ifdef PTR_MANGLE + PTR_MANGLE (%ecx) +#endif + movl %ecx, (JB_PC*4)(%eax) + movl %ebp, (JB_BP*4)(%eax) /* Save caller's frame pointer. */ + + /* Call __sigjmp_save. */ + pushl $1 + cfi_adjust_cfa_offset (4) + pushl 8(%esp) + cfi_adjust_cfa_offset (4) + call __sigjmp_save + popl %ecx + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + ret +END (setjmp) diff --git a/REORG.TODO/sysdeps/i386/bzero.S b/REORG.TODO/sysdeps/i386/bzero.S new file mode 100644 index 0000000000..c8dd47b4da --- /dev/null +++ b/REORG.TODO/sysdeps/i386/bzero.S @@ -0,0 +1,5 @@ +#define USE_AS_BZERO +#define memset __bzero +#include "memset.S" + +weak_alias (__bzero, bzero) diff --git a/REORG.TODO/sysdeps/i386/cacheinfo.c b/REORG.TODO/sysdeps/i386/cacheinfo.c new file mode 100644 index 0000000000..f15fe0779a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/cacheinfo.c @@ -0,0 +1,3 @@ +#define DISABLE_PREFETCHW + +#include <sysdeps/x86/cacheinfo.c> diff --git a/REORG.TODO/sysdeps/i386/configure b/REORG.TODO/sysdeps/i386/configure new file mode 100644 index 0000000000..5b55c5affe --- /dev/null +++ b/REORG.TODO/sysdeps/i386/configure @@ -0,0 +1,84 @@ +# This file is generated from configure.ac by Autoconf. DO NOT EDIT! 
 # Local configure fragment for sysdeps/i386. + +# We no longer support i386 since it lacks the atomic instructions +# required to implement NPTL threading. +if test "$config_machine" = i386; then + as_fn_error $? " +*** ERROR: Support for i386 is deprecated. +*** Please use host i786, i686, i586 or i486. +*** For example: /src/glibc/configure --host=i686-pc-linux-gnu ...\"" "$LINENO" 5 +fi + +# The GNU C Library can't be built for i386. There are several reasons for +# this restriction. The primary reason is that i386 lacks the atomic +# operations required to support the current NPTL implementation. While it is +# possible that such atomic operations could be emulated in the kernel, to date +# no such work has been done to enable this. Even with NPTL disabled you still +# have no atomic.h implementation. Given the declining use of i386 we disable +# support for building with `-march=i386' or `-mcpu=i386.' We don't explicitly +# check for i386; instead we make sure the compiler has support for inlining +# the builtin __sync_val_compare_and_swap. If it does then we should have no +# problem building for i386. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for compiler support of inlined builtin function __sync_val_compare_and_swap" >&5 +$as_echo_n "checking for compiler support of inlined builtin function __sync_val_compare_and_swap... " >&6; } +libc_compiler_builtin_inlined=no +cat > conftest.c <<EOF +int _start (void) { int a, b, c; __sync_val_compare_and_swap (&a, b, c); return 0; } +EOF +if ! { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS + -O0 -nostdlib -nostartfiles + -S conftest.c -o - | fgrep "__sync_val_compare_and_swap" + 1>&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; } +then + libc_compiler_builtin_inlined=yes +fi +rm -f conftest* +if test $libc_compiler_builtin_inlined = yes; then + libc_cv_unsupported_i386=no +else + as_fn_error $? " +*** Building with -march=i386/-mcpu=i386 is not supported. +*** Please use host i786, i686, i586, or i486. +*** For example: /source/glibc/configure CFLAGS='-O2 -march=i686' ..." "$LINENO" 5 +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_compiler_builtin_inlined" >&5 +$as_echo "$libc_compiler_builtin_inlined" >&6; } + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5 +$as_echo_n "checking for Intel MPX support... " >&6; } +if ${libc_cv_asm_mpx+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.s <<\EOF + bndmov %bnd0,(%esp) +EOF +if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5' + { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$?
= $ac_status" >&5 + test $ac_status = 0; }; }; then + libc_cv_asm_mpx=yes +else + libc_cv_asm_mpx=no +fi +rm -f conftest* +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_mpx" >&5 +$as_echo "$libc_cv_asm_mpx" >&6; } +if test $libc_cv_asm_mpx = yes; then + $as_echo "#define HAVE_MPX_SUPPORT 1" >>confdefs.h + +fi + +$as_echo "#define USE_REGPARMS 1" >>confdefs.h + + +$as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h + diff --git a/REORG.TODO/sysdeps/i386/configure.ac b/REORG.TODO/sysdeps/i386/configure.ac new file mode 100644 index 0000000000..19ef33f34a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/configure.ac @@ -0,0 +1,52 @@ +GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. +# Local configure fragment for sysdeps/i386. + +# We no longer support i386 since it lacks the atomic instructions +# required to implement NPTL threading. +if test "$config_machine" = i386; then + AC_MSG_ERROR([ +*** ERROR: Support for i386 is deprecated. +*** Please use host i786, i686, i586 or i486. +*** For example: /src/glibc/configure --host=i686-pc-linux-gnu ..."]) +fi + +# The GNU C Library can't be built for i386. There are several reasons for +# this restriction. The primary reason is that i386 lacks the atomic +# operations required to support the current NPTL implementation. While it is +# possible that such atomic operations could be emulated in the kernel, to date +# no such work has been done to enable this. Even with NPTL disabled you still +# have no atomic.h implementation. Given the declining use of i386 we disable +# support for building with `-march=i386' or `-mcpu=i386.' We don't explicitly +# check for i386; instead we make sure the compiler has support for inlining +# the builtin __sync_val_compare_and_swap. If it does then we should have no +# problem building for i386. +LIBC_COMPILER_BUILTIN_INLINED( + [__sync_val_compare_and_swap], + [int a, b, c; __sync_val_compare_and_swap (&a, b, c);], + [-O0], + [libc_cv_unsupported_i386=no], + [AC_MSG_ERROR([ +*** Building with -march=i386/-mcpu=i386 is not supported. +*** Please use host i786, i686, i586, or i486. +*** For example: /source/glibc/configure CFLAGS='-O2 -march=i686' ...])]) + +dnl Check whether asm supports Intel MPX +AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl +cat > conftest.s <<\EOF + bndmov %bnd0,(%esp) +EOF +if AC_TRY_COMMAND(${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD); then + libc_cv_asm_mpx=yes +else + libc_cv_asm_mpx=no +fi +rm -f conftest*]) +if test $libc_cv_asm_mpx = yes; then + AC_DEFINE(HAVE_MPX_SUPPORT) +fi + +AC_DEFINE(USE_REGPARMS) + +dnl It is always possible to access static and hidden symbols in a +dnl position independent way. +AC_DEFINE(PI_STATIC_AND_HIDDEN) diff --git a/REORG.TODO/sysdeps/i386/crti.S b/REORG.TODO/sysdeps/i386/crti.S new file mode 100644 index 0000000000..f800209990 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/crti.S @@ -0,0 +1,84 @@ +/* Special .init and .fini section support for x86. + Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version.
+ + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file with other + programs, and to distribute those programs without any restriction + coming from the use of this file. (The GNU Lesser General Public + License restrictions do apply in other respects; for example, they + cover modification of the file, and distribution when not linked + into another program.) + + Note that people who make modified versions of this file are not + obligated to grant this special exception for their modified + versions; it is their choice whether to do so. The GNU Lesser + General Public License gives permission to release a modified + version without this exception; this exception also makes it + possible to release a modified version which carries forward this + exception. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* crti.S puts a function prologue at the beginning of the .init and + .fini sections and defines global symbols for those addresses, so + they can be called as functions. The symbols _init and _fini are + magic and cause the linker to emit DT_INIT and DT_FINI. */ + +#include <libc-symbols.h> +#include <sysdep.h> + +#ifndef PREINIT_FUNCTION +# define PREINIT_FUNCTION __gmon_start__ +#endif + +#ifndef PREINIT_FUNCTION_WEAK +# define PREINIT_FUNCTION_WEAK 1 +#endif + +#if PREINIT_FUNCTION_WEAK + weak_extern (PREINIT_FUNCTION) +#else + .hidden PREINIT_FUNCTION +#endif + + .section .init,"ax",@progbits + .p2align 2 + .globl _init + .type _init, @function +_init: + pushl %ebx + /* Maintain 16-byte stack alignment for called functions. */ + subl $8, %esp + LOAD_PIC_REG (bx) +#if PREINIT_FUNCTION_WEAK + movl PREINIT_FUNCTION@GOT(%ebx), %eax + testl %eax, %eax + je .Lno_weak_fn + call PREINIT_FUNCTION@PLT +.Lno_weak_fn: +#else + call PREINIT_FUNCTION +#endif + + .section .fini,"ax",@progbits + .p2align 2 + .globl _fini + .type _fini, @function +_fini: + pushl %ebx + subl $8, %esp + LOAD_PIC_REG (bx) diff --git a/REORG.TODO/sysdeps/i386/crtn.S b/REORG.TODO/sysdeps/i386/crtn.S new file mode 100644 index 0000000000..b18b9c171a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/crtn.S @@ -0,0 +1,47 @@ +/* Special .init and .fini section support for x86. + Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file with other + programs, and to distribute those programs without any restriction + coming from the use of this file. (The GNU Lesser General Public + License restrictions do apply in other respects; for example, they + cover modification of the file, and distribution when not linked + into another program.) 
+ + Note that people who make modified versions of this file are not + obligated to grant this special exception for their modified + versions; it is their choice whether to do so. The GNU Lesser + General Public License gives permission to release a modified + version without this exception; this exception also makes it + possible to release a modified version which carries forward this + exception. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* crtn.S puts function epilogues in the .init and .fini sections + corresponding to the prologues in crti.S. */ + + .section .init,"ax",@progbits + addl $8, %esp + popl %ebx + ret + + .section .fini,"ax",@progbits + addl $8, %esp + popl %ebx + ret diff --git a/REORG.TODO/sysdeps/i386/dl-irel.h b/REORG.TODO/sysdeps/i386/dl-irel.h new file mode 100644 index 0000000000..824e81aed1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/dl-irel.h @@ -0,0 +1,51 @@ +/* Machine-dependent ELF indirect relocation inline functions. + i386 version. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _DL_IREL_H +#define _DL_IREL_H + +#include <stdio.h> +#include <unistd.h> + +#define ELF_MACHINE_IREL 1 + +static inline Elf32_Addr +__attribute ((always_inline)) +elf_ifunc_invoke (Elf32_Addr addr) +{ + return ((Elf32_Addr (*) (void)) (addr)) (); +} + +static inline void +__attribute ((always_inline)) +elf_irel (const Elf32_Rel *reloc) +{ + Elf32_Addr *const reloc_addr = (void *) reloc->r_offset; + const unsigned long int r_type = ELF32_R_TYPE (reloc->r_info); + + if (__glibc_likely (r_type == R_386_IRELATIVE)) + { + Elf32_Addr value = elf_ifunc_invoke(*reloc_addr); + *reloc_addr = value; + } + else + __libc_fatal ("unexpected reloc type in static binary"); +} + +#endif /* dl-irel.h */ diff --git a/REORG.TODO/sysdeps/i386/dl-lookupcfg.h b/REORG.TODO/sysdeps/i386/dl-lookupcfg.h new file mode 100644 index 0000000000..47b534a059 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/dl-lookupcfg.h @@ -0,0 +1,32 @@ +/* Configuration of lookup functions. + Copyright (C) 2005-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define DL_UNMAP_IS_SPECIAL + +#include_next <dl-lookupcfg.h> + +/* Address of protected data defined in the shared library may be + external due to copy relocation. */ +#define DL_EXTERN_PROTECTED_DATA + +struct link_map; + +extern void _dl_unmap (struct link_map *map) + internal_function attribute_hidden; + +#define DL_UNMAP(map) _dl_unmap (map) diff --git a/REORG.TODO/sysdeps/i386/dl-machine.h b/REORG.TODO/sysdeps/i386/dl-machine.h new file mode 100644 index 0000000000..57d4a0bdbd --- /dev/null +++ b/REORG.TODO/sysdeps/i386/dl-machine.h @@ -0,0 +1,757 @@ +/* Machine-dependent ELF dynamic relocation inline functions. i386 version. + Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef dl_machine_h +#define dl_machine_h + +#define ELF_MACHINE_NAME "i386" + +#include <sys/param.h> +#include <sysdep.h> +#include <tls.h> +#include <dl-tlsdesc.h> +#include <cpu-features.c> + +/* Return nonzero iff ELF header is compatible with the running host. */ +static inline int __attribute__ ((unused)) +elf_machine_matches_host (const Elf32_Ehdr *ehdr) +{ + return ehdr->e_machine == EM_386; +} + + +/* Return the link-time address of _DYNAMIC. Conveniently, this is the + first element of the GOT, a special entry that is never relocated. */ +static inline Elf32_Addr __attribute__ ((unused, const)) +elf_machine_dynamic (void) +{ + /* This produces a GOTOFF reloc that resolves to zero at link time, so in + fact just loads from the GOT register directly. By doing it without + an asm we can let the compiler choose any register. */ + extern const Elf32_Addr _GLOBAL_OFFSET_TABLE_[] attribute_hidden; + return _GLOBAL_OFFSET_TABLE_[0]; +} + +/* Return the run-time load address of the shared object. */ +static inline Elf32_Addr __attribute__ ((unused)) +elf_machine_load_address (void) +{ + /* Compute the difference between the runtime address of _DYNAMIC as seen + by a GOTOFF reference, and the link-time address found in the special + unrelocated first GOT entry. */ + extern Elf32_Dyn bygotoff[] asm ("_DYNAMIC") attribute_hidden; + return (Elf32_Addr) &bygotoff - elf_machine_dynamic (); +} + +/* Set up the loaded object described by L so its unrelocated PLT + entries will jump to the on-demand fixup code in dl-runtime.c. 
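+ (Once a PLT call has been resolved, _dl_runtime_resolve writes the real + target address back into the GOT slot, so subsequent calls go straight to + the target and bypass the fixup path.)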
*/ + +static inline int __attribute__ ((unused, always_inline)) +elf_machine_runtime_setup (struct link_map *l, int lazy, int profile) +{ + Elf32_Addr *got; + extern void _dl_runtime_resolve (Elf32_Word) attribute_hidden; + extern void _dl_runtime_profile (Elf32_Word) attribute_hidden; + + if (l->l_info[DT_JMPREL] && lazy) + { + /* The GOT entries for functions in the PLT have not yet been filled + in. Their initial contents will arrange when called to push an + offset into the .rel.plt section, push _GLOBAL_OFFSET_TABLE_[1], + and then jump to _GLOBAL_OFFSET_TABLE[2]. */ + got = (Elf32_Addr *) D_PTR (l, l_info[DT_PLTGOT]); + /* If a library is prelinked but we have to relocate anyway, + we have to be able to undo the prelinking of .got.plt. + The prelinker saved us here address of .plt + 0x16. */ + if (got[1]) + { + l->l_mach.plt = got[1] + l->l_addr; + l->l_mach.gotplt = (Elf32_Addr) &got[3]; + } + got[1] = (Elf32_Addr) l; /* Identify this shared object. */ + + /* The got[2] entry contains the address of a function which gets + called to get the address of a so far unresolved function and + jump to it. The profiling extension of the dynamic linker allows + to intercept the calls to collect information. In this case we + don't store the address in the GOT so that all future calls also + end in this function. */ + if (__glibc_unlikely (profile)) + { + got[2] = (Elf32_Addr) &_dl_runtime_profile; + + if (GLRO(dl_profile) != NULL + && _dl_name_match_p (GLRO(dl_profile), l)) + /* This is the object we are looking for. Say that we really + want profiling and the timers are started. */ + GL(dl_profile_map) = l; + } + else + /* This function will get called to fix up the GOT entry indicated by + the offset on the stack, and then jump to the resolved address. */ + got[2] = (Elf32_Addr) &_dl_runtime_resolve; + } + + return lazy; +} + +#ifdef IN_DL_RUNTIME + +# ifndef PROF +/* We add a declaration of this function here so that in dl-runtime.c + the ELF_MACHINE_RUNTIME_TRAMPOLINE macro really can pass the parameters + in registers. + + We cannot use this scheme for profiling because the _mcount call + destroys the passed register information. */ +#define ARCH_FIXUP_ATTRIBUTE __attribute__ ((regparm (3), stdcall, unused)) + +extern ElfW(Addr) _dl_fixup (struct link_map *l, + ElfW(Word) reloc_offset) + ARCH_FIXUP_ATTRIBUTE; +extern ElfW(Addr) _dl_profile_fixup (struct link_map *l, + ElfW(Word) reloc_offset, + ElfW(Addr) retaddr, void *regs, + long int *framesizep) + ARCH_FIXUP_ATTRIBUTE; +# endif + +#endif + +/* Mask identifying addresses reserved for the user program, + where the dynamic linker should not map anything. */ +#define ELF_MACHINE_USER_ADDRESS_MASK 0xf8000000UL + +/* Initial entry point code for the dynamic linker. + The C function `_dl_start' is the real entry point; + its return value is the user program's entry point. 
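+
+   In outline, the assembly below behaves like this sketch (a
+   simplification only; the real code must run before any
+   relocation has been applied):
+
+     entry = _dl_start (initial_stack);
+     adjust stack by _dl_skip_args words;    ld.so run as a command
+     _dl_init (main_map, argc, argv, env);   run initializers
+     jump to entry with %edx = &_dl_fini;    per the ELF ABI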
*/ + +#define RTLD_START asm ("\n\ + .text\n\ + .align 16\n\ +0: movl (%esp), %ebx\n\ + ret\n\ + .align 16\n\ +.globl _start\n\ +.globl _dl_start_user\n\ +_start:\n\ + # Note that _dl_start gets the parameter in %eax.\n\ + movl %esp, %eax\n\ + call _dl_start\n\ +_dl_start_user:\n\ + # Save the user entry point address in %edi.\n\ + movl %eax, %edi\n\ + # Point %ebx at the GOT.\n\ + call 0b\n\ + addl $_GLOBAL_OFFSET_TABLE_, %ebx\n\ + # See if we were run as a command with the executable file\n\ + # name as an extra leading argument.\n\ + movl _dl_skip_args@GOTOFF(%ebx), %eax\n\ + # Pop the original argument count.\n\ + popl %edx\n\ + # Adjust the stack pointer to skip _dl_skip_args words.\n\ + leal (%esp,%eax,4), %esp\n\ + # Subtract _dl_skip_args from argc.\n\ + subl %eax, %edx\n\ + # Push argc back on the stack.\n\ + push %edx\n\ + # The special initializer gets called with the stack just\n\ + # as the application's entry point will see it; it can\n\ + # switch stacks if it moves these contents over.\n\ +" RTLD_START_SPECIAL_INIT "\n\ + # Load the parameters again.\n\ + # (eax, edx, ecx, *--esp) = (_dl_loaded, argc, argv, envp)\n\ + movl _rtld_local@GOTOFF(%ebx), %eax\n\ + leal 8(%esp,%edx,4), %esi\n\ + leal 4(%esp), %ecx\n\ + movl %esp, %ebp\n\ + # Make sure _dl_init is run with 16 byte aligned stack.\n\ + andl $-16, %esp\n\ + pushl %eax\n\ + pushl %eax\n\ + pushl %ebp\n\ + pushl %esi\n\ + # Clear %ebp, so that even constructors have terminated backchain.\n\ + xorl %ebp, %ebp\n\ + # Call the function to run the initializers.\n\ + call _dl_init\n\ + # Pass our finalizer function to the user in %edx, as per ELF ABI.\n\ + leal _dl_fini@GOTOFF(%ebx), %edx\n\ + # Restore %esp _start expects.\n\ + movl (%esp), %esp\n\ + # Jump to the user's entry point.\n\ + jmp *%edi\n\ + .previous\n\ +"); + +#ifndef RTLD_START_SPECIAL_INIT +# define RTLD_START_SPECIAL_INIT /* nothing */ +#endif + +/* ELF_RTYPE_CLASS_PLT iff TYPE describes relocation of a PLT entry or + TLS variable, so undefined references should not be allowed to + define the value. + ELF_RTYPE_CLASS_COPY iff TYPE should not be allowed to resolve to one + of the main executable's symbols, as for a COPY reloc. + ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA iff TYPE describes relocation may + against protected data whose address be external due to copy relocation. + */ +# define elf_machine_type_class(type) \ + ((((type) == R_386_JMP_SLOT || (type) == R_386_TLS_DTPMOD32 \ + || (type) == R_386_TLS_DTPOFF32 || (type) == R_386_TLS_TPOFF32 \ + || (type) == R_386_TLS_TPOFF || (type) == R_386_TLS_DESC) \ + * ELF_RTYPE_CLASS_PLT) \ + | (((type) == R_386_COPY) * ELF_RTYPE_CLASS_COPY) \ + | (((type) == R_386_GLOB_DAT) * ELF_RTYPE_CLASS_EXTERN_PROTECTED_DATA)) + +/* A reloc type used for ld.so cmdline arg lookups to reject PLT entries. */ +#define ELF_MACHINE_JMP_SLOT R_386_JMP_SLOT + +/* The i386 never uses Elf32_Rela relocations for the dynamic linker. + Prelinked libraries may use Elf32_Rela though. */ +#define ELF_MACHINE_PLT_REL 1 + +/* We define an initialization functions. This is called very early in + _dl_sysdep_start. */ +#define DL_PLATFORM_INIT dl_platform_init () + +static inline void __attribute__ ((unused)) +dl_platform_init (void) +{ +#if IS_IN (rtld) + /* init_cpu_features has been called early from __libc_start_main in + static executable. */ + init_cpu_features (&GLRO(dl_x86_cpu_features)); +#else + if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0') + /* Avoid an empty string which would disturb us. 
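+     For example, GLRO(dl_platform) is what a "$PLATFORM" dynamic
+     string token expands to; an empty but non-NULL string would
+     still be substituted, hence the reset to NULL.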
*/ + GLRO(dl_platform) = NULL; +#endif +} + +static inline Elf32_Addr +elf_machine_fixup_plt (struct link_map *map, lookup_t t, + const Elf32_Rel *reloc, + Elf32_Addr *reloc_addr, Elf32_Addr value) +{ + return *reloc_addr = value; +} + +/* Return the final value of a plt relocation. */ +static inline Elf32_Addr +elf_machine_plt_value (struct link_map *map, const Elf32_Rel *reloc, + Elf32_Addr value) +{ + return value; +} + + +/* Names of the architecture-specific auditing callback functions. */ +#define ARCH_LA_PLTENTER i86_gnu_pltenter +#define ARCH_LA_PLTEXIT i86_gnu_pltexit + +#endif /* !dl_machine_h */ + +/* The i386 never uses Elf32_Rela relocations for the dynamic linker. + Prelinked libraries may use Elf32_Rela though. */ +#define ELF_MACHINE_NO_RELA defined RTLD_BOOTSTRAP +#define ELF_MACHINE_NO_REL 0 + +#ifdef RESOLVE_MAP + +/* Perform the relocation specified by RELOC and SYM (which is fully resolved). + MAP is the object containing the reloc. */ + +auto inline void +__attribute ((always_inline)) +elf_machine_rel (struct link_map *map, const Elf32_Rel *reloc, + const Elf32_Sym *sym, const struct r_found_version *version, + void *const reloc_addr_arg, int skip_ifunc) +{ + Elf32_Addr *const reloc_addr = reloc_addr_arg; + const unsigned int r_type = ELF32_R_TYPE (reloc->r_info); + +# if !defined RTLD_BOOTSTRAP || !defined HAVE_Z_COMBRELOC + if (__glibc_unlikely (r_type == R_386_RELATIVE)) + { +# if !defined RTLD_BOOTSTRAP && !defined HAVE_Z_COMBRELOC + /* This is defined in rtld.c, but nowhere in the static libc.a; + make the reference weak so static programs can still link. + This declaration cannot be done when compiling rtld.c + (i.e. #ifdef RTLD_BOOTSTRAP) because rtld.c contains the + common defn for _dl_rtld_map, which is incompatible with a + weak decl in the same file. */ +# ifndef SHARED + weak_extern (_dl_rtld_map); +# endif + if (map != &GL(dl_rtld_map)) /* Already done in rtld itself. */ +# endif + *reloc_addr += map->l_addr; + } +# ifndef RTLD_BOOTSTRAP + else if (__glibc_unlikely (r_type == R_386_NONE)) + return; +# endif + else +# endif /* !RTLD_BOOTSTRAP and have no -z combreloc */ + { +# ifndef RTLD_BOOTSTRAP + const Elf32_Sym *const refsym = sym; +# endif + struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type); + Elf32_Addr value = sym_map == NULL ? 0 : sym_map->l_addr + sym->st_value; + + if (sym != NULL + && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, + 0) + && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1) + && __builtin_expect (!skip_ifunc, 1)) + { +# ifndef RTLD_BOOTSTRAP + if (sym_map != map + && sym_map->l_type != lt_executable + && !sym_map->l_relocated) + { + const char *strtab + = (const char *) D_PTR (map, l_info[DT_STRTAB]); + _dl_error_printf ("\ +%s: Relink `%s' with `%s' for IFUNC symbol `%s'\n", + RTLD_PROGNAME, map->l_name, + sym_map->l_name, + strtab + refsym->st_name); + } +# endif + value = ((Elf32_Addr (*) (void)) value) (); + } + + switch (r_type) + { +# ifndef RTLD_BOOTSTRAP + case R_386_SIZE32: + /* Set to symbol size plus addend. */ + *reloc_addr += sym->st_size; + break; +# endif + case R_386_GLOB_DAT: + case R_386_JMP_SLOT: + *reloc_addr = value; + break; + + case R_386_TLS_DTPMOD32: +# ifdef RTLD_BOOTSTRAP + /* During startup the dynamic linker is always the module + with index 1. + XXX If this relocation is necessary move before RESOLVE + call. */ + *reloc_addr = 1; +# else + /* Get the information from the link map returned by the + resolv function. 
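+
+	 The module ID stored here is what __tls_get_addr later uses
+	 to index the DTV; roughly
+
+	   addr = dtv[ti_module].pointer + ti_offset;
+
+	 with ti_module from this R_386_TLS_DTPMOD32 word and
+	 ti_offset from the matching R_386_TLS_DTPOFF32 word.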
*/ + if (sym_map != NULL) + *reloc_addr = sym_map->l_tls_modid; +# endif + break; + case R_386_TLS_DTPOFF32: +# ifndef RTLD_BOOTSTRAP + /* During relocation all TLS symbols are defined and used. + Therefore the offset is already correct. */ + if (sym != NULL) + *reloc_addr = sym->st_value; +# endif + break; + case R_386_TLS_DESC: + { + struct tlsdesc volatile *td = + (struct tlsdesc volatile *)reloc_addr; + +# ifndef RTLD_BOOTSTRAP + if (! sym) + td->entry = _dl_tlsdesc_undefweak; + else +# endif + { +# ifndef RTLD_BOOTSTRAP +# ifndef SHARED + CHECK_STATIC_TLS (map, sym_map); +# else + if (!TRY_STATIC_TLS (map, sym_map)) + { + td->arg = _dl_make_tlsdesc_dynamic + (sym_map, sym->st_value + (ElfW(Word))td->arg); + td->entry = _dl_tlsdesc_dynamic; + } + else +# endif +# endif + { + td->arg = (void*)(sym->st_value - sym_map->l_tls_offset + + (ElfW(Word))td->arg); + td->entry = _dl_tlsdesc_return; + } + } + break; + } + case R_386_TLS_TPOFF32: + /* The offset is positive, backward from the thread pointer. */ +# ifdef RTLD_BOOTSTRAP + *reloc_addr += map->l_tls_offset - sym->st_value; +# else + /* We know the offset of object the symbol is contained in. + It is a positive value which will be subtracted from the + thread pointer. To get the variable position in the TLS + block we subtract the offset from that of the TLS block. */ + if (sym != NULL) + { + CHECK_STATIC_TLS (map, sym_map); + *reloc_addr += sym_map->l_tls_offset - sym->st_value; + } +# endif + break; + case R_386_TLS_TPOFF: + /* The offset is negative, forward from the thread pointer. */ +# ifdef RTLD_BOOTSTRAP + *reloc_addr += sym->st_value - map->l_tls_offset; +# else + /* We know the offset of object the symbol is contained in. + It is a negative value which will be added to the + thread pointer. */ + if (sym != NULL) + { + CHECK_STATIC_TLS (map, sym_map); + *reloc_addr += sym->st_value - sym_map->l_tls_offset; + } +# endif + break; + +# ifndef RTLD_BOOTSTRAP + case R_386_32: + *reloc_addr += value; + break; + case R_386_PC32: + *reloc_addr += (value - (Elf32_Addr) reloc_addr); + break; + case R_386_COPY: + if (sym == NULL) + /* This can happen in trace mode if an object could not be + found. 
*/ + break; + if (__builtin_expect (sym->st_size > refsym->st_size, 0) + || (__builtin_expect (sym->st_size < refsym->st_size, 0) + && GLRO(dl_verbose))) + { + const char *strtab; + + strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]); + _dl_error_printf ("\ +%s: Symbol `%s' has different size in shared object, consider re-linking\n", + RTLD_PROGNAME, strtab + refsym->st_name); + } + memcpy (reloc_addr_arg, (void *) value, + MIN (sym->st_size, refsym->st_size)); + break; + case R_386_IRELATIVE: + value = map->l_addr + *reloc_addr; + value = ((Elf32_Addr (*) (void)) value) (); + *reloc_addr = value; + break; + default: + _dl_reloc_bad_type (map, r_type, 0); + break; +# endif /* !RTLD_BOOTSTRAP */ + } + } +} + +# ifndef RTLD_BOOTSTRAP +auto inline void +__attribute__ ((always_inline)) +elf_machine_rela (struct link_map *map, const Elf32_Rela *reloc, + const Elf32_Sym *sym, const struct r_found_version *version, + void *const reloc_addr_arg, int skip_ifunc) +{ + Elf32_Addr *const reloc_addr = reloc_addr_arg; + const unsigned int r_type = ELF32_R_TYPE (reloc->r_info); + + if (ELF32_R_TYPE (reloc->r_info) == R_386_RELATIVE) + *reloc_addr = map->l_addr + reloc->r_addend; + else if (r_type != R_386_NONE) + { +# ifndef RESOLVE_CONFLICT_FIND_MAP + const Elf32_Sym *const refsym = sym; +# endif + struct link_map *sym_map = RESOLVE_MAP (&sym, version, r_type); + Elf32_Addr value = sym == NULL ? 0 : sym_map->l_addr + sym->st_value; + + if (sym != NULL + && __builtin_expect (sym->st_shndx != SHN_UNDEF, 1) + && __builtin_expect (ELFW(ST_TYPE) (sym->st_info) == STT_GNU_IFUNC, 0) + && __builtin_expect (!skip_ifunc, 1)) + value = ((Elf32_Addr (*) (void)) value) (); + + switch (ELF32_R_TYPE (reloc->r_info)) + { + case R_386_SIZE32: + /* Set to symbol size plus addend. */ + value = sym->st_size; + case R_386_GLOB_DAT: + case R_386_JMP_SLOT: + case R_386_32: + *reloc_addr = value + reloc->r_addend; + break; +# ifndef RESOLVE_CONFLICT_FIND_MAP + /* Not needed for dl-conflict.c. */ + case R_386_PC32: + *reloc_addr = (value + reloc->r_addend - (Elf32_Addr) reloc_addr); + break; + + case R_386_TLS_DTPMOD32: + /* Get the information from the link map returned by the + resolv function. */ + if (sym_map != NULL) + *reloc_addr = sym_map->l_tls_modid; + break; + case R_386_TLS_DTPOFF32: + /* During relocation all TLS symbols are defined and used. + Therefore the offset is already correct. */ + *reloc_addr = (sym == NULL ? 0 : sym->st_value) + reloc->r_addend; + break; + case R_386_TLS_DESC: + { + struct tlsdesc volatile *td = + (struct tlsdesc volatile *)reloc_addr; + +# ifndef RTLD_BOOTSTRAP + if (!sym) + { + td->arg = (void*)reloc->r_addend; + td->entry = _dl_tlsdesc_undefweak; + } + else +# endif + { +# ifndef RTLD_BOOTSTRAP +# ifndef SHARED + CHECK_STATIC_TLS (map, sym_map); +# else + if (!TRY_STATIC_TLS (map, sym_map)) + { + td->arg = _dl_make_tlsdesc_dynamic + (sym_map, sym->st_value + reloc->r_addend); + td->entry = _dl_tlsdesc_dynamic; + } + else +# endif +# endif + { + td->arg = (void*)(sym->st_value - sym_map->l_tls_offset + + reloc->r_addend); + td->entry = _dl_tlsdesc_return; + } + } + } + break; + case R_386_TLS_TPOFF32: + /* The offset is positive, backward from the thread pointer. */ + /* We know the offset of object the symbol is contained in. + It is a positive value which will be subtracted from the + thread pointer. To get the variable position in the TLS + block we subtract the offset from that of the TLS block. 
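+
+	     A worked example: with l_tls_offset == 16, st_value == 4
+	     and a zero addend the word stored is 12, and the access
+	     computes TP - 12, i.e. the block base (TP - 16) plus the
+	     variable's offset 4 within the block.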
*/ + if (sym != NULL) + { + CHECK_STATIC_TLS (map, sym_map); + *reloc_addr = sym_map->l_tls_offset - sym->st_value + + reloc->r_addend; + } + break; + case R_386_TLS_TPOFF: + /* The offset is negative, forward from the thread pointer. */ + /* We know the offset of object the symbol is contained in. + It is a negative value which will be added to the + thread pointer. */ + if (sym != NULL) + { + CHECK_STATIC_TLS (map, sym_map); + *reloc_addr = sym->st_value - sym_map->l_tls_offset + + reloc->r_addend; + } + break; + case R_386_COPY: + if (sym == NULL) + /* This can happen in trace mode if an object could not be + found. */ + break; + if (__builtin_expect (sym->st_size > refsym->st_size, 0) + || (__builtin_expect (sym->st_size < refsym->st_size, 0) + && GLRO(dl_verbose))) + { + const char *strtab; + + strtab = (const char *) D_PTR (map, l_info[DT_STRTAB]); + _dl_error_printf ("\ +%s: Symbol `%s' has different size in shared object, consider re-linking\n", + RTLD_PROGNAME, strtab + refsym->st_name); + } + memcpy (reloc_addr_arg, (void *) value, + MIN (sym->st_size, refsym->st_size)); + break; +# endif /* !RESOLVE_CONFLICT_FIND_MAP */ + case R_386_IRELATIVE: + value = map->l_addr + reloc->r_addend; + value = ((Elf32_Addr (*) (void)) value) (); + *reloc_addr = value; + break; + default: + /* We add these checks in the version to relocate ld.so only + if we are still debugging. */ + _dl_reloc_bad_type (map, r_type, 0); + break; + } + } +} +# endif /* !RTLD_BOOTSTRAP */ + +auto inline void +__attribute ((always_inline)) +elf_machine_rel_relative (Elf32_Addr l_addr, const Elf32_Rel *reloc, + void *const reloc_addr_arg) +{ + Elf32_Addr *const reloc_addr = reloc_addr_arg; + assert (ELF32_R_TYPE (reloc->r_info) == R_386_RELATIVE); + *reloc_addr += l_addr; +} + +# ifndef RTLD_BOOTSTRAP +auto inline void +__attribute__ ((always_inline)) +elf_machine_rela_relative (Elf32_Addr l_addr, const Elf32_Rela *reloc, + void *const reloc_addr_arg) +{ + Elf32_Addr *const reloc_addr = reloc_addr_arg; + *reloc_addr = l_addr + reloc->r_addend; +} +# endif /* !RTLD_BOOTSTRAP */ + +auto inline void +__attribute__ ((always_inline)) +elf_machine_lazy_rel (struct link_map *map, + Elf32_Addr l_addr, const Elf32_Rel *reloc, + int skip_ifunc) +{ + Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset); + const unsigned int r_type = ELF32_R_TYPE (reloc->r_info); + /* Check for unexpected PLT reloc type. */ + if (__glibc_likely (r_type == R_386_JMP_SLOT)) + { + if (__builtin_expect (map->l_mach.plt, 0) == 0) + *reloc_addr += l_addr; + else + *reloc_addr = (map->l_mach.plt + + (((Elf32_Addr) reloc_addr) - map->l_mach.gotplt) * 4); + } + else if (__glibc_likely (r_type == R_386_TLS_DESC)) + { + struct tlsdesc volatile * __attribute__((__unused__)) td = + (struct tlsdesc volatile *)reloc_addr; + + /* Handle relocations that reference the local *ABS* in a simple + way, so as to preserve a potential addend. */ + if (ELF32_R_SYM (reloc->r_info) == 0) + td->entry = _dl_tlsdesc_resolve_abs_plus_addend; + /* Given a known-zero addend, we can store a pointer to the + reloc in the arg position. */ + else if (td->arg == 0) + { + td->arg = (void*)reloc; + td->entry = _dl_tlsdesc_resolve_rel; + } + else + { + /* We could handle non-*ABS* relocations with non-zero addends + by allocating dynamically an arg to hold a pointer to the + reloc, but that sounds pointless. */ + const Elf32_Rel *const r = reloc; + /* The code below was borrowed from elf_dynamic_do_rel(). 
*/ + const ElfW(Sym) *const symtab = + (const void *) D_PTR (map, l_info[DT_SYMTAB]); + +# ifdef RTLD_BOOTSTRAP + /* The dynamic linker always uses versioning. */ + assert (map->l_info[VERSYMIDX (DT_VERSYM)] != NULL); +# else + if (map->l_info[VERSYMIDX (DT_VERSYM)]) +# endif + { + const ElfW(Half) *const version = + (const void *) D_PTR (map, l_info[VERSYMIDX (DT_VERSYM)]); + ElfW(Half) ndx = version[ELFW(R_SYM) (r->r_info)] & 0x7fff; + elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)], + &map->l_versions[ndx], + (void *) (l_addr + r->r_offset), skip_ifunc); + } +# ifndef RTLD_BOOTSTRAP + else + elf_machine_rel (map, r, &symtab[ELFW(R_SYM) (r->r_info)], NULL, + (void *) (l_addr + r->r_offset), skip_ifunc); +# endif + } + } + else if (__glibc_unlikely (r_type == R_386_IRELATIVE)) + { + Elf32_Addr value = map->l_addr + *reloc_addr; + if (__glibc_likely (!skip_ifunc)) + value = ((Elf32_Addr (*) (void)) value) (); + *reloc_addr = value; + } + else + _dl_reloc_bad_type (map, r_type, 1); +} + +# ifndef RTLD_BOOTSTRAP + +auto inline void +__attribute__ ((always_inline)) +elf_machine_lazy_rela (struct link_map *map, + Elf32_Addr l_addr, const Elf32_Rela *reloc, + int skip_ifunc) +{ + Elf32_Addr *const reloc_addr = (void *) (l_addr + reloc->r_offset); + const unsigned int r_type = ELF32_R_TYPE (reloc->r_info); + if (__glibc_likely (r_type == R_386_JMP_SLOT)) + ; + else if (__glibc_likely (r_type == R_386_TLS_DESC)) + { + struct tlsdesc volatile * __attribute__((__unused__)) td = + (struct tlsdesc volatile *)reloc_addr; + + td->arg = (void*)reloc; + td->entry = _dl_tlsdesc_resolve_rela; + } + else if (__glibc_unlikely (r_type == R_386_IRELATIVE)) + { + Elf32_Addr value = map->l_addr + reloc->r_addend; + if (__glibc_likely (!skip_ifunc)) + value = ((Elf32_Addr (*) (void)) value) (); + *reloc_addr = value; + } + else + _dl_reloc_bad_type (map, r_type, 1); +} + +# endif /* !RTLD_BOOTSTRAP */ + +#endif /* RESOLVE_MAP */ diff --git a/REORG.TODO/sysdeps/i386/dl-procinfo.c b/REORG.TODO/sysdeps/i386/dl-procinfo.c new file mode 100644 index 0000000000..7237f778b2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/dl-procinfo.c @@ -0,0 +1,65 @@ +/* Data for i386 version of processor capability information. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2001. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* If anything should be added here check whether the size of each string + is still ok with the given array size. + + All the #ifdefs in the definitions are quite irritating but + necessary if we want to avoid duplicating the information. There + are three different modes: + + - PROCINFO_DECL is defined. This means we are only interested in + declarations. + + - PROCINFO_DECL is not defined: + + + if SHARED is defined the file is included in an array + initializer. 
The .element = { ... } syntax is needed. + + + if SHARED is not defined a normal array initialization is + needed. + */ + +#ifndef PROCINFO_CLASS +# define PROCINFO_CLASS +#endif + +#include <sysdeps/x86/dl-procinfo.c> + +#if !defined PROCINFO_DECL && defined SHARED + ._dl_x86_cap_flags +#else +PROCINFO_CLASS const char _dl_x86_cap_flags[32][8] +#endif +#ifndef PROCINFO_DECL += { + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", + "cx8", "apic", "10", "sep", "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", "20", "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe" + } +#endif +#if !defined SHARED || defined PROCINFO_DECL +; +#else +, +#endif + +#undef PROCINFO_DECL +#undef PROCINFO_CLASS diff --git a/REORG.TODO/sysdeps/i386/dl-tls.h b/REORG.TODO/sysdeps/i386/dl-tls.h new file mode 100644 index 0000000000..525ebab992 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/dl-tls.h @@ -0,0 +1,61 @@ +/* Thread-local storage handling in the ELF dynamic linker. i386 version. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +/* Type used for the representation of TLS information in the GOT. */ +typedef struct dl_tls_index +{ + unsigned long int ti_module; + unsigned long int ti_offset; +} tls_index; + + +#ifdef SHARED +/* This is the prototype for the GNU version. */ +extern void *___tls_get_addr (tls_index *ti) + __attribute__ ((__regparm__ (1))); +extern void *___tls_get_addr_internal (tls_index *ti) + __attribute__ ((__regparm__ (1))) attribute_hidden; + +# if IS_IN (rtld) +/* The special thing about the x86 TLS ABI is that we have two + variants of the __tls_get_addr function with different calling + conventions. The GNU version, which we are mostly concerned here, + takes the parameter in a register. The name is changed by adding + an additional underscore at the beginning. The Sun version uses + the normal calling convention. */ +void * +__tls_get_addr (tls_index *ti) +{ + return ___tls_get_addr_internal (ti); +} + + +/* Prepare using the definition of __tls_get_addr in the generic + version of this file. */ +# define __tls_get_addr __attribute__ ((__regparm__ (1))) ___tls_get_addr +strong_alias (___tls_get_addr, ___tls_get_addr_internal) +rtld_hidden_proto (___tls_get_addr) +rtld_hidden_def (___tls_get_addr) +#else + +/* Users should get the better interface. */ +# define __tls_get_addr ___tls_get_addr + +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/dl-tlsdesc.S b/REORG.TODO/sysdeps/i386/dl-tlsdesc.S new file mode 100644 index 0000000000..8befdc2b39 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/dl-tlsdesc.S @@ -0,0 +1,285 @@ +/* Thread-local storage handling in the ELF dynamic linker. i386 version. + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
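+
+   A compiler using the GNU2 TLS dialect reaches the entry points in
+   this file through a call sequence of roughly this shape (a sketch;
+   the relocations involved are R_386_TLS_GOTDESC and
+   R_386_TLS_DESC_CALL):
+
+	leal	x@TLSDESC(%ebx), %eax
+	call	*x@TLSCALL(%eax)
+	movl	%gs:(%eax), %ecx	# %eax is the TP offset of x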
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <tls.h> +#include "tlsdesc.h" + + .text + + /* This function is used to compute the TP offset for symbols in + Static TLS, i.e., whose TP offset is the same for all + threads. + + The incoming %eax points to the TLS descriptor, such that + 0(%eax) points to _dl_tlsdesc_return itself, and 4(%eax) holds + the TP offset of the symbol corresponding to the object + denoted by the argument. */ + + .hidden _dl_tlsdesc_return + .global _dl_tlsdesc_return + .type _dl_tlsdesc_return,@function + cfi_startproc + .align 16 +_dl_tlsdesc_return: + movl 4(%eax), %eax + ret + cfi_endproc + .size _dl_tlsdesc_return, .-_dl_tlsdesc_return + + /* This function is used for undefined weak TLS symbols, for + which the base address (i.e., disregarding any addend) should + resolve to NULL. + + %eax points to the TLS descriptor, such that 0(%eax) points to + _dl_tlsdesc_undefweak itself, and 4(%eax) holds the addend. + We return the addend minus the TP, such that, when the caller + adds TP, it gets the addend back. If that's zero, as usual, + that's most likely a NULL pointer. */ + + .hidden _dl_tlsdesc_undefweak + .global _dl_tlsdesc_undefweak + .type _dl_tlsdesc_undefweak,@function + cfi_startproc + .align 16 +_dl_tlsdesc_undefweak: + movl 4(%eax), %eax + subl %gs:0, %eax + ret + cfi_endproc + .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak + +#ifdef SHARED + .hidden _dl_tlsdesc_dynamic + .global _dl_tlsdesc_dynamic + .type _dl_tlsdesc_dynamic,@function + + /* This function is used for symbols that need dynamic TLS. + + %eax points to the TLS descriptor, such that 0(%eax) points to + _dl_tlsdesc_dynamic itself, and 4(%eax) points to a struct + tlsdesc_dynamic_arg object. It must return in %eax the offset + between the thread pointer and the object denoted by the + argument, without clobbering any registers. + + The assembly code that follows is a rendition of the following + C code, hand-optimized a little bit. + +ptrdiff_t +__attribute__ ((__regparm__ (1))) +_dl_tlsdesc_dynamic (struct tlsdesc *tdp) +{ + struct tlsdesc_dynamic_arg *td = tdp->arg; + dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET); + if (__builtin_expect (td->gen_count <= dtv[0].counter + && (dtv[td->tlsinfo.ti_module].pointer.val + != TLS_DTV_UNALLOCATED), + 1)) + return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset + - __thread_pointer; + + return ___tls_get_addr (&td->tlsinfo) - __thread_pointer; +} +*/ + cfi_startproc + .align 16 +_dl_tlsdesc_dynamic: + /* Like all TLS resolvers, preserve call-clobbered registers. + We need two scratch regs anyway. 
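+
+	   Only %ecx and %edx are spilled below: %eax carries both
+	   the argument and the return value, and on this ABI the
+	   call-clobbered set is exactly %eax/%ecx/%edx, whether we
+	   take the fast DTV path or the ___tls_get_addr slow path.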
*/ + subl $28, %esp + cfi_adjust_cfa_offset (28) + movl %ecx, 20(%esp) + movl %edx, 24(%esp) + movl TLSDESC_ARG(%eax), %eax + movl %gs:DTV_OFFSET, %edx + movl TLSDESC_GEN_COUNT(%eax), %ecx + cmpl (%edx), %ecx + ja .Lslow + movl TLSDESC_MODID(%eax), %ecx + movl (%edx,%ecx,8), %edx + cmpl $-1, %edx + je .Lslow + movl TLSDESC_MODOFF(%eax), %eax + addl %edx, %eax +.Lret: + movl 20(%esp), %ecx + subl %gs:0, %eax + movl 24(%esp), %edx + addl $28, %esp + cfi_adjust_cfa_offset (-28) + ret + .p2align 4,,7 +.Lslow: + cfi_adjust_cfa_offset (28) + call HIDDEN_JUMPTARGET (___tls_get_addr) + jmp .Lret + cfi_endproc + .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic +#endif /* SHARED */ + + /* This function is a wrapper for a lazy resolver for TLS_DESC + REL relocations that reference the *ABS* segment in their own + link maps. %ebx points to the caller's GOT. %eax points to a + TLS descriptor, such that 0(%eax) holds the address of the + resolver wrapper itself (unless some other thread beat us to + it) and 4(%eax) holds the addend in the relocation. + + When the actual resolver returns, it will have adjusted the + TLS descriptor such that we can tail-call it for it to return + the TP offset of the symbol. */ + + .hidden _dl_tlsdesc_resolve_abs_plus_addend + .global _dl_tlsdesc_resolve_abs_plus_addend + .type _dl_tlsdesc_resolve_abs_plus_addend,@function + cfi_startproc + .align 16 +_dl_tlsdesc_resolve_abs_plus_addend: +0: + pushl %eax + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %edx + cfi_adjust_cfa_offset (4) + movl $1f - 0b, %ecx + movl 4(%ebx), %edx + call _dl_tlsdesc_resolve_abs_plus_addend_fixup +1: + popl %edx + cfi_adjust_cfa_offset (-4) + popl %ecx + cfi_adjust_cfa_offset (-4) + popl %eax + cfi_adjust_cfa_offset (-4) + jmp *(%eax) + cfi_endproc + .size _dl_tlsdesc_resolve_abs_plus_addend, .-_dl_tlsdesc_resolve_abs_plus_addend + + /* This function is a wrapper for a lazy resolver for TLS_DESC + REL relocations that had zero addends. %ebx points to the + caller's GOT. %eax points to a TLS descriptor, such that + 0(%eax) holds the address of the resolver wrapper itself + (unless some other thread beat us to it) and 4(%eax) holds a + pointer to the relocation. + + When the actual resolver returns, it will have adjusted the + TLS descriptor such that we can tail-call it for it to return + the TP offset of the symbol. */ + + .hidden _dl_tlsdesc_resolve_rel + .global _dl_tlsdesc_resolve_rel + .type _dl_tlsdesc_resolve_rel,@function + cfi_startproc + .align 16 +_dl_tlsdesc_resolve_rel: +0: + pushl %eax + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %edx + cfi_adjust_cfa_offset (4) + movl $1f - 0b, %ecx + movl 4(%ebx), %edx + call _dl_tlsdesc_resolve_rel_fixup +1: + popl %edx + cfi_adjust_cfa_offset (-4) + popl %ecx + cfi_adjust_cfa_offset (-4) + popl %eax + cfi_adjust_cfa_offset (-4) + jmp *(%eax) + cfi_endproc + .size _dl_tlsdesc_resolve_rel, .-_dl_tlsdesc_resolve_rel + + /* This function is a wrapper for a lazy resolver for TLS_DESC + RELA relocations. %ebx points to the caller's GOT. %eax + points to a TLS descriptor, such that 0(%eax) holds the + address of the resolver wrapper itself (unless some other + thread beat us to it) and 4(%eax) holds a pointer to the + relocation. + + When the actual resolver returns, it will have adjusted the + TLS descriptor such that we can tail-call it for it to return + the TP offset of the symbol. 
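+
+	   Schematically, the fixup called below rewrites the
+	   descriptor (holding GL(dl_load_lock), with concurrent
+	   callers parked in _dl_tlsdesc_resolve_hold) to
+
+	     td->entry = _dl_tlsdesc_return, _dl_tlsdesc_undefweak
+			 or _dl_tlsdesc_dynamic;
+	     td->arg   = the matching TP offset or argument block;
+
+	   so the final jmp *(%eax) re-enters the updated entry point.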
*/ + + .hidden _dl_tlsdesc_resolve_rela + .global _dl_tlsdesc_resolve_rela + .type _dl_tlsdesc_resolve_rela,@function + cfi_startproc + .align 16 +_dl_tlsdesc_resolve_rela: +0: + pushl %eax + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %edx + cfi_adjust_cfa_offset (4) + movl $1f - 0b, %ecx + movl 4(%ebx), %edx + call _dl_tlsdesc_resolve_rela_fixup +1: + popl %edx + cfi_adjust_cfa_offset (-4) + popl %ecx + cfi_adjust_cfa_offset (-4) + popl %eax + cfi_adjust_cfa_offset (-4) + jmp *(%eax) + cfi_endproc + .size _dl_tlsdesc_resolve_rela, .-_dl_tlsdesc_resolve_rela + + /* This function is a placeholder for lazy resolving of TLS + relocations. Once some thread starts resolving a TLS + relocation, it sets up the TLS descriptor to use this + resolver, such that other threads that would attempt to + resolve it concurrently may skip the call to the original lazy + resolver and go straight to a condition wait. + + When the actual resolver returns, it will have adjusted the + TLS descriptor such that we can tail-call it for it to return + the TP offset of the symbol. */ + + .hidden _dl_tlsdesc_resolve_hold + .global _dl_tlsdesc_resolve_hold + .type _dl_tlsdesc_resolve_hold,@function + cfi_startproc + .align 16 +_dl_tlsdesc_resolve_hold: +0: + pushl %eax + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %edx + cfi_adjust_cfa_offset (4) + movl $1f - 0b, %ecx + movl 4(%ebx), %edx + call _dl_tlsdesc_resolve_hold_fixup +1: + popl %edx + cfi_adjust_cfa_offset (-4) + popl %ecx + cfi_adjust_cfa_offset (-4) + popl %eax + cfi_adjust_cfa_offset (-4) + jmp *(%eax) + cfi_endproc + .size _dl_tlsdesc_resolve_hold, .-_dl_tlsdesc_resolve_hold diff --git a/REORG.TODO/sysdeps/i386/dl-tlsdesc.h b/REORG.TODO/sysdeps/i386/dl-tlsdesc.h new file mode 100644 index 0000000000..242bebfc8e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/dl-tlsdesc.h @@ -0,0 +1,61 @@ +/* Thread-local storage descriptor handling in the ELF dynamic linker. + i386 version. + Copyright (C) 2005-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _I386_DL_TLSDESC_H +# define _I386_DL_TLSDESC_H 1 + +/* Type used to represent a TLS descriptor in the GOT. */ +struct tlsdesc +{ + ptrdiff_t __attribute__ ((regparm (1))) (*entry) (struct tlsdesc *); + void *arg; +}; + +typedef struct dl_tls_index +{ + unsigned long int ti_module; + unsigned long int ti_offset; +} tls_index; + +/* Type used as the argument in a TLS descriptor for a symbol that + needs dynamic TLS offsets. 
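+   The resolver checks gen_count against the current DTV generation
+   and, when the DTV is up to date, returns roughly
+
+     dtv[tlsinfo.ti_module].pointer + tlsinfo.ti_offset - TP
+
+   falling back to ___tls_get_addr (&tlsinfo) - TP otherwise.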
*/ +struct tlsdesc_dynamic_arg +{ + tls_index tlsinfo; + size_t gen_count; +}; + +extern ptrdiff_t attribute_hidden __attribute__ ((regparm (1))) + _dl_tlsdesc_return (struct tlsdesc *), + _dl_tlsdesc_undefweak (struct tlsdesc *), + _dl_tlsdesc_resolve_abs_plus_addend (struct tlsdesc *), + _dl_tlsdesc_resolve_rel (struct tlsdesc *), + _dl_tlsdesc_resolve_rela (struct tlsdesc *), + _dl_tlsdesc_resolve_hold (struct tlsdesc *); + +# ifdef SHARED +extern void *_dl_make_tlsdesc_dynamic (struct link_map *map, + size_t ti_offset) + internal_function attribute_hidden; + +extern ptrdiff_t attribute_hidden __attribute__ ((regparm (1))) + _dl_tlsdesc_dynamic (struct tlsdesc *); +# endif + +#endif diff --git a/REORG.TODO/sysdeps/i386/dl-trampoline.S b/REORG.TODO/sysdeps/i386/dl-trampoline.S new file mode 100644 index 0000000000..6e7f3aef92 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/dl-trampoline.S @@ -0,0 +1,215 @@ +/* PLT trampolines. i386 version. + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <link-defines.h> + +#ifdef HAVE_MPX_SUPPORT +# define PRESERVE_BND_REGS_PREFIX bnd +#else +# define PRESERVE_BND_REGS_PREFIX .byte 0xf2 +#endif + + .text + .globl _dl_runtime_resolve + .type _dl_runtime_resolve, @function + cfi_startproc + .align 16 +_dl_runtime_resolve: + cfi_adjust_cfa_offset (8) + pushl %eax # Preserve registers otherwise clobbered. + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %edx + cfi_adjust_cfa_offset (4) + movl 16(%esp), %edx # Copy args pushed by PLT in register. Note + movl 12(%esp), %eax # that `fixup' takes its parameters in regs. + call _dl_fixup # Call resolver. + popl %edx # Get register content back. + cfi_adjust_cfa_offset (-4) + movl (%esp), %ecx + movl %eax, (%esp) # Store the function address. + movl 4(%esp), %eax + ret $12 # Jump to function address. + cfi_endproc + .size _dl_runtime_resolve, .-_dl_runtime_resolve + + +#ifndef PROF + .globl _dl_runtime_profile + .type _dl_runtime_profile, @function + cfi_startproc + .align 16 +_dl_runtime_profile: + cfi_adjust_cfa_offset (8) + pushl %esp + cfi_adjust_cfa_offset (4) + addl $8, (%esp) # Account for the pushed PLT data + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %eax # Preserve registers otherwise clobbered. + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %edx + cfi_adjust_cfa_offset (4) + movl %esp, %ecx + subl $8, %esp + cfi_adjust_cfa_offset (8) + movl $-1, 4(%esp) + leal 4(%esp), %edx + movl %edx, (%esp) + pushl %ecx # Address of the register structure + cfi_adjust_cfa_offset (4) + movl 40(%esp), %ecx # Load return address + movl 36(%esp), %edx # Copy args pushed by PLT in register. Note + movl 32(%esp), %eax # that `fixup' takes its parameters in regs. + call _dl_profile_fixup # Call resolver. 
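+	# On return %eax holds the resolved address and the word now
+	# at (%esp) is the frame size _dl_profile_fixup stored through
+	# its long int *framesizep argument.  It stays negative when
+	# no la_pltexit hook has to run, so we fall through to the
+	# direct "ret $20" below; a size >= 0 makes the jns take the
+	# frame-copy path so _dl_call_pltexit can inspect the call.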
+ cfi_adjust_cfa_offset (-8) + movl (%esp), %edx + testl %edx, %edx + jns 1f + popl %edx + cfi_adjust_cfa_offset (-4) + popl %edx # Get register content back. + cfi_adjust_cfa_offset (-4) + movl (%esp), %ecx + movl %eax, (%esp) # Store the function address. + movl 4(%esp), %eax + ret $20 # Jump to function address. + + /* + +32 return address + +28 PLT1 + +24 PLT2 + +20 %esp + +16 %ebp + +12 %eax + +8 %ecx + +4 %edx + %esp free + */ + cfi_adjust_cfa_offset (8) +1: movl %ebx, (%esp) + cfi_rel_offset (ebx, 0) + movl %edx, %ebx # This is the frame buffer size + pushl %edi + cfi_adjust_cfa_offset (4) + cfi_rel_offset (edi, 0) + pushl %esi + cfi_adjust_cfa_offset (4) + cfi_rel_offset (esi, 0) + leal 44(%esp), %esi + movl %ebx, %ecx + orl $4, %ebx # Increase frame size if necessary to align + # stack for the function call + andl $~3, %ebx + movl %esp, %edi + subl %ebx, %edi + movl %esp, %ebx + cfi_def_cfa_register (ebx) + movl %edi, %esp + shrl $2, %ecx + rep + movsl + movl (%ebx), %esi + cfi_restore (esi) + movl 4(%ebx), %edi + cfi_restore (edi) + /* + %ebx+40 return address + %ebx+36 PLT1 + %ebx+32 PLT2 + %ebx+28 %esp + %ebx+24 %ebp + %ebx+20 %eax + %ebx+16 %ecx + %ebx+12 %edx + %ebx+8 %ebx + %ebx+4 free + %ebx free + %esp copied stack frame + */ + movl %eax, (%ebx) + movl 12(%ebx), %edx + movl 16(%ebx), %ecx + movl 20(%ebx), %eax + call *(%ebx) + movl %ebx, %esp + cfi_def_cfa_register (esp) + movl 8(%esp), %ebx + cfi_restore (ebx) + /* + +40 return address + +36 PLT1 + +32 PLT2 + +28 %esp + +24 %ebp + +20 %eax + +16 %ecx + +12 %edx + +8 free + +4 free + %esp free + */ +#if LONG_DOUBLE_SIZE != 12 +# error "long double size must be 12 bytes" +#endif + # Allocate space for La_i86_retval and subtract 12 free bytes. + subl $(LRV_SIZE - 12), %esp + cfi_adjust_cfa_offset (LRV_SIZE - 12) + movl %eax, LRV_EAX_OFFSET(%esp) + movl %edx, LRV_EDX_OFFSET(%esp) + fstpt LRV_ST0_OFFSET(%esp) + fstpt LRV_ST1_OFFSET(%esp) +#ifdef HAVE_MPX_SUPPORT + bndmov %bnd0, LRV_BND0_OFFSET(%esp) + bndmov %bnd1, LRV_BND1_OFFSET(%esp) +#else + .byte 0x66,0x0f,0x1b,0x44,0x24,LRV_BND0_OFFSET + .byte 0x66,0x0f,0x1b,0x4c,0x24,LRV_BND1_OFFSET +#endif + pushl %esp + cfi_adjust_cfa_offset (4) + # Address of La_i86_regs area. + leal (LRV_SIZE + 4)(%esp), %ecx + # PLT2 + movl (LRV_SIZE + 4 + LR_SIZE)(%esp), %eax + # PLT1 + movl (LRV_SIZE + 4 + LR_SIZE + 4)(%esp), %edx + call _dl_call_pltexit + movl LRV_EAX_OFFSET(%esp), %eax + movl LRV_EDX_OFFSET(%esp), %edx + fldt LRV_ST1_OFFSET(%esp) + fldt LRV_ST0_OFFSET(%esp) +#ifdef HAVE_MPX_SUPPORT + bndmov LRV_BND0_OFFSET(%esp), %bnd0 + bndmov LRV_BND1_OFFSET(%esp), %bnd1 +#else + .byte 0x66,0x0f,0x1a,0x44,0x24,LRV_BND0_OFFSET + .byte 0x66,0x0f,0x1a,0x4c,0x24,LRV_BND1_OFFSET +#endif + # Restore stack before return. + addl $(LRV_SIZE + 4 + LR_SIZE + 4), %esp + cfi_adjust_cfa_offset (-(LRV_SIZE + 4 + LR_SIZE + 4)) + PRESERVE_BND_REGS_PREFIX + ret + cfi_endproc + .size _dl_runtime_profile, .-_dl_runtime_profile +#endif diff --git a/REORG.TODO/sysdeps/i386/ffs.c b/REORG.TODO/sysdeps/i386/ffs.c new file mode 100644 index 0000000000..c229c8166e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/ffs.c @@ -0,0 +1,50 @@ +/* ffs -- find first set bit in a word, counted from least significant end. + For Intel 80x86, x>=3. + This file is part of the GNU C Library. + Copyright (C) 1991-2017 Free Software Foundation, Inc. + Contributed by Torbjorn Granlund (tege@sics.se). 
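+
+   ffs (x) returns one plus the index of the least significant set
+   bit of x, or zero when x is zero; e.g. ffs (0) == 0, ffs (1) == 1
+   and ffs (0x18) == 4.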
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define ffsl __something_else +#include <string.h> + +#undef ffs + +#ifdef __GNUC__ + +int +__ffs (int x) +{ + int cnt; + int tmp; + + asm ("xorl %0,%0\n" /* Set CNT to zero. */ + "bsfl %2,%1\n" /* Count low bits in X and store in %1. */ + "jz 1f\n" /* Jump if OK, i.e. X was non-zero. */ + "leal 1(%1),%0\n" /* Return bsfl-result plus one on %0. */ + "1:" : "=&a" (cnt), "=r" (tmp) : "rm" (x)); + + return cnt; +} +weak_alias (__ffs, ffs) +libc_hidden_def (__ffs) +libc_hidden_builtin_def (ffs) +#undef ffsl +weak_alias (__ffs, ffsl) + +#else +#include <string/ffs.c> +#endif diff --git a/REORG.TODO/sysdeps/i386/fpu/Implies b/REORG.TODO/sysdeps/i386/fpu/Implies new file mode 100644 index 0000000000..2b745a34fb --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/Implies @@ -0,0 +1 @@ +x86/fpu diff --git a/REORG.TODO/sysdeps/i386/fpu/Versions b/REORG.TODO/sysdeps/i386/fpu/Versions new file mode 100644 index 0000000000..a2eec371f1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/Versions @@ -0,0 +1,6 @@ +libm { + GLIBC_2.2 { + # functions used in inline functions or macros + __expl; __expm1l; + } +} diff --git a/REORG.TODO/sysdeps/i386/fpu/doasin.c b/REORG.TODO/sysdeps/i386/fpu/doasin.c new file mode 100644 index 0000000000..1cc8931700 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/doasin.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acos.S b/REORG.TODO/sysdeps/i386/fpu/e_acos.S new file mode 100644 index 0000000000..586c7fc406 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_acos.S @@ -0,0 +1,25 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: e_acos.S,v 1.4 1995/05/08 23:44:37 jtc Exp $") + +/* acos = atan (sqrt((1-x) (1+x)) / x) */ +ENTRY(__ieee754_acos) + fldl 4(%esp) /* x */ + fld %st /* x : x */ + fld1 /* 1 : x : x */ + fsubp /* 1 - x : x */ + fld1 /* 1 : 1 - x : x */ + fadd %st(2) /* 1 + x : 1 - x : x */ + fmulp /* 1 - x^2 : x */ + fsqrt /* sqrt (1 - x^2) : x */ + fabs + fxch %st(1) /* x : sqrt (1 - x^2) */ + fpatan /* atan (sqrt(1 - x^2) / x) */ + ret +END (__ieee754_acos) +strong_alias (__ieee754_acos, __acos_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acosf.S b/REORG.TODO/sysdeps/i386/fpu/e_acosf.S new file mode 100644 index 0000000000..54930af8b2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_acosf.S @@ -0,0 +1,24 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>. 
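+ *
+ * Since fpatan is really a two-argument arctangent, the sequence
+ * below computes atan2 (sqrt (1 - x*x), x), which already lands in
+ * the required range [0, pi] for negative x; e.g. x = -1 gives
+ * atan2 (0, -1) = pi.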
+ */ + +#include <machine/asm.h> + +RCSID("$NetBSD: $") + +/* acos = atan (sqrt(1 - x^2) / x) */ +ENTRY(__ieee754_acosf) + flds 4(%esp) /* x */ + fld %st + fmul %st(0) /* x^2 */ + fld1 + fsubp /* 1 - x^2 */ + fsqrt /* sqrt (1 - x^2) */ + fabs + fxch %st(1) + fpatan + ret +END (__ieee754_acosf) +strong_alias (__ieee754_acosf, __acosf_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acosh.S b/REORG.TODO/sysdeps/i386/fpu/e_acosh.S new file mode 100644 index 0000000000..9555ef8078 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_acosh.S @@ -0,0 +1,101 @@ +/* ix87 specific implementation of arcsinh. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + +#ifdef PIC +#define MO(op) op##@GOTOFF(%edx) +#else +#define MO(op) op +#endif + + .text +ENTRY(__ieee754_acosh) + movl 8(%esp), %ecx + cmpl $0x3ff00000, %ecx + jl 5f // < 1 => invalid + fldln2 // log(2) + fldl 4(%esp) // x : log(2) + cmpl $0x41b00000, %ecx + ja 3f // x > 2^28 +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + cmpl $0x40000000, %ecx + ja 4f // x > 2 + + // 1 <= x <= 2 => y = log1p(x-1+sqrt(2*(x-1)+(x-1)^2)) + fsubl MO(one) // x-1 : log(2) + fabs // acosh(1) is +0 in all rounding modes + fld %st // x-1 : x-1 : log(2) + fmul %st(1) // (x-1)^2 : x-1 : log(2) + fadd %st(1) // x-1+(x-1)^2 : x-1 : log(2) + fadd %st(1) // 2*(x-1)+(x-1)^2 : x-1 : log(2) + fsqrt // sqrt(2*(x-1)+(x-1)^2) : x-1 : log(2) + faddp // x-1+sqrt(2*(x-1)+(x-1)^2) : log(2) + fcoml MO(limit) + fnstsw + sahf + ja 2f + fyl2xp1 // log1p(x-1+sqrt(2*(x-1)+(x-1)^2)) + ret + +2: faddl MO(one) // x+sqrt(2*(x-1)+(x-1)^2) : log(2) + fyl2x // log(x+sqrt(2*(x-1)+(x-1)^2)) + ret + + // x > 2^28 => y = log(x) + log(2) + .align ALIGNARG(4) +3: fyl2x // log(x) + fldln2 // log(2) : log(x) + faddp // log(x)+log(2) + ret + + // 2^28 > x > 2 => y = log(2*x - 1/(x+sqrt(x*x-1))) + .align ALIGNARG(4) +4: fld %st // x : x : log(2) + fadd %st, %st(1) // x : 2*x : log(2) + fld %st // x : x : 2*x : log(2) + fmul %st(1) // x^2 : x : 2*x : log(2) + fsubl MO(one) // x^2-1 : x : 2*x : log(2) + fsqrt // sqrt(x^2-1) : x : 2*x : log(2) + faddp // x+sqrt(x^2-1) : 2*x : log(2) + fdivrl MO(one) // 1/(x+sqrt(x^2-1)) : 2*x : log(2) + fsubrp // 2*x+1/(x+sqrt(x^2)-1) : log(2) + fyl2x // log(2*x+1/(x+sqrt(x^2-1))) + ret + + // x < 1 (or -NaN) => NaN + .align ALIGNARG(4) +5: fldl 4(%esp) + fsub %st + fdiv %st, %st(0) + ret +END(__ieee754_acosh) +strong_alias (__ieee754_acosh, __acosh_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acoshf.S b/REORG.TODO/sysdeps/i386/fpu/e_acoshf.S new file mode 100644 index 
0000000000..662fda3c06 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_acoshf.S @@ -0,0 +1,101 @@ +/* ix87 specific implementation of arcsinh. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + +#ifdef PIC +#define MO(op) op##@GOTOFF(%edx) +#else +#define MO(op) op +#endif + + .text +ENTRY(__ieee754_acoshf) + movl 4(%esp), %ecx + cmpl $0x3f800000, %ecx + jl 5f // < 1 => invalid + fldln2 // log(2) + flds 4(%esp) // x : log(2) + cmpl $0x47000000, %ecx + ja 3f // x > 2^14 +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + cmpl $0x40000000, %ecx + ja 4f // x > 2 + + // 1 <= x <= 2 => y = log1p(x-1+sqrt(2*(x-1)+(x-1)^2)) + fsubl MO(one) // x-1 : log(2) + fabs // acosh(1) is +0 in all rounding modes + fld %st // x-1 : x-1 : log(2) + fmul %st(1) // (x-1)^2 : x-1 : log(2) + fadd %st(1) // x-1+(x-1)^2 : x-1 : log(2) + fadd %st(1) // 2*(x-1)+(x-1)^2 : x-1 : log(2) + fsqrt // sqrt(2*(x-1)+(x-1)^2) : x-1 : log(2) + faddp // x-1+sqrt(2*(x-1)+(x-1)^2) : log(2) + fcoml MO(limit) + fnstsw + sahf + ja 2f + fyl2xp1 // log1p(x-1+sqrt(2*(x-1)+(x-1)^2)) + ret + +2: faddl MO(one) // x+sqrt(2*(x-1)+(x-1)^2) : log(2) + fyl2x // log(x+sqrt(2*(x-1)+(x-1)^2)) + ret + + // x > 2^14 => y = log(x) + log(2) + .align ALIGNARG(4) +3: fyl2x // log(x) + fldln2 // log(2) : log(x) + faddp // log(x)+log(2) + ret + + // 2^28 > x > 2 => y = log(2*x - 1/(x+sqrt(x*x-1))) + .align ALIGNARG(4) +4: fld %st // x : x : log(2) + fadd %st, %st(1) // x : 2*x : log(2) + fld %st // x : x : 2*x : log(2) + fmul %st(1) // x^2 : x : 2*x : log(2) + fsubl MO(one) // x^2-1 : x : 2*x : log(2) + fsqrt // sqrt(x^2-1) : x : 2*x : log(2) + faddp // x+sqrt(x^2-1) : 2*x : log(2) + fdivrl MO(one) // 1/(x+sqrt(x^2-1)) : 2*x : log(2) + fsubrp // 2*x+1/(x+sqrt(x^2)-1) : log(2) + fyl2x // log(2*x+1/(x+sqrt(x^2-1))) + ret + + // x < 1 (or -NaN) => NaN + .align ALIGNARG(4) +5: flds 4(%esp) + fsub %st + fdiv %st, %st(0) + ret +END(__ieee754_acoshf) +strong_alias (__ieee754_acoshf, __acoshf_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acoshl.S b/REORG.TODO/sysdeps/i386/fpu/e_acoshl.S new file mode 100644 index 0000000000..e0d6466aac --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_acoshl.S @@ -0,0 +1,107 @@ +/* ix87 specific implementation of arcsinh. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. 
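+
+   For 1 <= x <= 2 the code below relies on the identity
+   2*(x-1) + (x-1)^2 = x^2 - 1, so the value handed to log1p is
+   (x-1) + sqrt (x^2 - 1); computing x^2 - 1 from the exact
+   difference x-1 avoids the cancellation the direct form
+   log (x + sqrt (x^2 - 1)) would suffer for x close to 1.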
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + /* Please note that we use double value for 1.0. This number + has an exact representation and so we don't get accuracy + problems. The advantage is that the code is simpler. */ + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. */ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + +#ifdef PIC +#define MO(op) op##@GOTOFF(%edx) +#else +#define MO(op) op +#endif + + .text +ENTRY(__ieee754_acoshl) + movl 12(%esp), %ecx + andl $0xffff, %ecx + cmpl $0x3fff, %ecx + jl 5f // < 1 => invalid + fldln2 // log(2) + fldt 4(%esp) // x : log(2) + cmpl $0x4020, %ecx + ja 3f // x > 2^34 +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + cmpl $0x4000, %ecx + ja 4f // x > 2 + + // 1 <= x <= 2 => y = log1p(x-1+sqrt(2*(x-1)+(x-1)^2)) + fsubl MO(one) // x-1 : log(2) + fabs // acosh(1) is +0 in all rounding modes + fld %st // x-1 : x-1 : log(2) + fmul %st(1) // (x-1)^2 : x-1 : log(2) + fadd %st(1) // x-1+(x-1)^2 : x-1 : log(2) + fadd %st(1) // 2*(x-1)+(x-1)^2 : x-1 : log(2) + fsqrt // sqrt(2*(x-1)+(x-1)^2) : x-1 : log(2) + faddp // x-1+sqrt(2*(x-1)+(x-1)^2) : log(2) + fcoml MO(limit) + fnstsw + sahf + ja 2f + fyl2xp1 // log1p(x-1+sqrt(2*(x-1)+(x-1)^2)) + ret + +2: faddl MO(one) // x+sqrt(2*(x-1)+(x-1)^2) : log(2) + fyl2x // log(x+sqrt(2*(x-1)+(x-1)^2)) + ret + + // x > 2^34 => y = log(x) + log(2) + .align ALIGNARG(4) +3: fyl2x // log(x) + fldln2 // log(2) : log(x) + faddp // log(x)+log(2) + ret + + // 2^34 > x > 2 => y = log(2*x - 1/(x+sqrt(x*x-1))) + .align ALIGNARG(4) +4: fld %st // x : x : log(2) + fadd %st, %st(1) // x : 2*x : log(2) + fld %st // x : x : 2*x : log(2) + fmul %st(1) // x^2 : x : 2*x : log(2) + fsubl MO(one) // x^2-1 : x : 2*x : log(2) + fsqrt // sqrt(x^2-1) : x : 2*x : log(2) + faddp // x+sqrt(x^2-1) : 2*x : log(2) + fdivrl MO(one) // 1/(x+sqrt(x^2-1)) : 2*x : log(2) + fsubrp // 2*x+1/(x+sqrt(x^2)-1) : log(2) + fyl2x // log(2*x+1/(x+sqrt(x^2-1))) + ret + + // x < 1 => NaN + .align ALIGNARG(4) +5: fldz + fdiv %st, %st(0) + ret +END(__ieee754_acoshl) +strong_alias (__ieee754_acoshl, __acoshl_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_acosl.c b/REORG.TODO/sysdeps/i386/fpu/e_acosl.c new file mode 100644 index 0000000000..ab08931924 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_acosl.c @@ -0,0 +1,29 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. 
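+ *
+ * Note that 1 - x^2 is computed as (1 - x) * (1 + x): for |x| close
+ * to 1 the difference 1 - x is exact, so this form loses less
+ * accuracy than squaring x first and subtracting from 1.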
+ */ + +#include <math_private.h> + +long double +__ieee754_acosl (long double x) +{ + long double res; + + /* acosl = atanl (sqrtl((1-x) (1+x)) / x) */ + asm ( "fld %%st\n" + "fld1\n" + "fsubp\n" + "fld1\n" + "fadd %%st(2)\n" + "fmulp\n" /* 1 - x^2 */ + "fsqrt\n" /* sqrtl (1 - x^2) */ + "fabs\n" + "fxch %%st(1)\n" + "fpatan" + : "=t" (res) : "0" (x) : "st(1)"); + return res; +} +strong_alias (__ieee754_acosl, __acosl_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_asin.S b/REORG.TODO/sysdeps/i386/fpu/e_asin.S new file mode 100644 index 0000000000..39c8b47da4 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_asin.S @@ -0,0 +1,38 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + +RCSID("$NetBSD: e_asin.S,v 1.4 1995/05/08 23:45:40 jtc Exp $") + +DEFINE_DBL_MIN + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +#else +# define MO(op) op +#endif + + .text + +/* asin = atan (x / sqrt((1-x) (1+x))) */ +ENTRY(__ieee754_asin) +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + fldl 4(%esp) /* x */ + fld %st + fld1 /* 1 : x : x */ + fsubp /* 1 - x : x */ + fld1 /* 1 : 1 - x : x */ + fadd %st(2) /* 1 + x : 1 - x : x */ + fmulp /* 1 - x^2 */ + fsqrt /* sqrt (1 - x^2) */ + fpatan + DBL_CHECK_FORCE_UFLOW + ret +END (__ieee754_asin) +strong_alias (__ieee754_asin, __asin_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_asinf.S b/REORG.TODO/sysdeps/i386/fpu/e_asinf.S new file mode 100644 index 0000000000..1102bdedfd --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_asinf.S @@ -0,0 +1,39 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>. + */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + +RCSID("$NetBSD: $") + + .section .rodata.cst4,"aM",@progbits,4 + +DEFINE_FLT_MIN + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +#else +# define MO(op) op +#endif + + .text + +/* asin = atan (x / sqrt(1 - x^2)) */ +ENTRY(__ieee754_asinf) +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + flds 4(%esp) /* x */ + fld %st + fmul %st(0) /* x^2 */ + fld1 + fsubp /* 1 - x^2 */ + fsqrt /* sqrt (1 - x^2) */ + fpatan + FLT_CHECK_FORCE_UFLOW + ret +END (__ieee754_asinf) +strong_alias (__ieee754_asinf, __asinf_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atan2.S b/REORG.TODO/sysdeps/i386/fpu/e_atan2.S new file mode 100644 index 0000000000..25f43bb5a1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_atan2.S @@ -0,0 +1,30 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + +RCSID("$NetBSD: e_atan2.S,v 1.4 1995/05/08 23:46:28 jtc Exp $") + +DEFINE_DBL_MIN + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_atan2) +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + fldl 4(%esp) + fldl 12(%esp) + fpatan + DBL_CHECK_FORCE_UFLOW_NARROW + ret +END (__ieee754_atan2) +strong_alias (__ieee754_atan2, __atan2_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atan2f.S b/REORG.TODO/sysdeps/i386/fpu/e_atan2f.S new file mode 100644 index 0000000000..2bc909a762 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_atan2f.S @@ -0,0 +1,30 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. 
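+ *
+ * fpatan computes a true atan2 (y, x), taking the quadrant from the
+ * signs of both operands; e.g. atan2f (1.0f, -1.0f) yields 3*pi/4
+ * rather than the principal arctangent of -1.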
+ */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + +RCSID("$NetBSD: e_atan2f.S,v 1.1 1995/05/08 23:35:10 jtc Exp $") + +DEFINE_FLT_MIN + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_atan2f) +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + flds 4(%esp) + flds 8(%esp) + fpatan + FLT_CHECK_FORCE_UFLOW_NARROW + ret +END (__ieee754_atan2f) +strong_alias (__ieee754_atan2f, __atan2f_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atan2l.c b/REORG.TODO/sysdeps/i386/fpu/e_atan2l.c new file mode 100644 index 0000000000..9f88bfcc08 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_atan2l.c @@ -0,0 +1,19 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + */ + +#include <math_private.h> + +long double +__ieee754_atan2l (long double y, long double x) +{ + long double res; + + asm ("fpatan" : "=t" (res) : "u" (y), "0" (x) : "st(1)"); + + return res; +} +strong_alias (__ieee754_atan2l, __atan2l_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atanh.S b/REORG.TODO/sysdeps/i386/fpu/e_atanh.S new file mode 100644 index 0000000000..cbc93d5da2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_atanh.S @@ -0,0 +1,112 @@ +/* ix87 specific implementation of arctanh function. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <machine/asm.h> +#include <i386-math-asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type half,@object +half: .double 0.5 + ASM_SIZE_DIRECTIVE(half) + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + .type ln2_2,@object +ln2_2: .tfloat 0.3465735902799726547086160 + ASM_SIZE_DIRECTIVE(ln2_2) + +DEFINE_DBL_MIN + +#ifdef PIC +#define MO(op) op##@GOTOFF(%edx) +#else +#define MO(op) op +#endif + + .text +ENTRY(__ieee754_atanh) + movl 8(%esp), %ecx + + movl %ecx, %eax + andl $0x7fffffff, %eax + cmpl $0x7ff00000, %eax + jae 5f +7: + +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + + andl $0x80000000, %ecx // ECX == 0 iff X >= 0 + + fldt MO(ln2_2) // 0.5*ln2 + xorl %ecx, 8(%esp) + fldl 4(%esp) // |x| : 0.5*ln2 + fcoml MO(half) // |x| : 0.5*ln2 + fld %st // |x| : |x| : 0.5*ln2 + fnstsw // |x| : |x| : 0.5*ln2 + sahf + jae 2f + fadd %st, %st(1) // |x| : 2*|x| : 0.5*ln2 + fld %st // |x| : |x| : 2*|x| : 0.5*ln2 + fsubrl MO(one) // 1-|x| : |x| : 2*|x| : 0.5*ln2 + fxch // |x| : 1-|x| : 2*|x| : 0.5*ln2 + fmul %st(2) // 2*|x|^2 : 1-|x| : 2*|x| : 0.5*ln2 + fdivp // (2*|x|^2)/(1-|x|) : 2*|x| : 0.5*ln2 + faddp // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2 + fcoml MO(limit) // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2 + fnstsw // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2 + sahf + jae 4f + fyl2xp1 // 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|)) + DBL_CHECK_FORCE_UFLOW_NONNEG + jecxz 3f + fchs // 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x)) +3: ret + + .align ALIGNARG(4) +4: faddl MO(one) // 1+2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2 + fyl2x // 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|)) + jecxz 3f + fchs // 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x)) +3: ret + + .align ALIGNARG(4) +2: faddl MO(one) // 1+|x| : |x| : 0.5*ln2 + fxch // |x| : 1+|x| : 0.5*ln2 + fsubrl MO(one) // 1-|x| : 1+|x| : 0.5*ln2 + fdivrp // (1+|x|)/(1-|x|) : 0.5*ln2 + fyl2x // 0.5*ln2*ld((1+|x|)/(1-|x|)) + jecxz 3f + fchs // 0.5*ln2*ld((1+x)/(1-x)) +3: ret + + // x == NaN or ±Inf +5: ja 6f + cmpl $0, 4(%esp) + je 7b +6: fldl 4(%esp) + ret +END(__ieee754_atanh) +strong_alias (__ieee754_atanh, __atanh_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atanhf.S b/REORG.TODO/sysdeps/i386/fpu/e_atanhf.S new file mode 100644 index 0000000000..92fda3fd82 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_atanhf.S @@ -0,0 +1,109 @@ +/* ix87 specific implementation of arctanh function. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <machine/asm.h> +#include <i386-math-asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type half,@object +half: .double 0.5 + ASM_SIZE_DIRECTIVE(half) + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + .align ALIGNARG(4) + .type ln2_2,@object +ln2_2: .tfloat 0.3465735902799726547086160 + ASM_SIZE_DIRECTIVE(ln2_2) + +DEFINE_FLT_MIN + +#ifdef PIC +#define MO(op) op##@GOTOFF(%edx) +#else +#define MO(op) op +#endif + + .text +ENTRY(__ieee754_atanhf) + movl 4(%esp), %ecx + + movl %ecx, %eax + andl $0x7fffffff, %eax + cmpl $0x7f800000, %eax + ja 5f + +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + + andl $0x80000000, %ecx // ECX == 0 iff X >= 0 + + fldt MO(ln2_2) // 0.5*ln2 + xorl %ecx, 4(%esp) + flds 4(%esp) // |x| : 0.5*ln2 + fcoml MO(half) // |x| : 0.5*ln2 + fld %st(0) // |x| : |x| : 0.5*ln2 + fnstsw // |x| : |x| : 0.5*ln2 + sahf + jae 2f + fadd %st, %st(1) // |x| : 2*|x| : 0.5*ln2 + fld %st // |x| : |x| : 2*|x| : 0.5*ln2 + fsubrl MO(one) // 1-|x| : |x| : 2*|x| : 0.5*ln2 + fxch // |x| : 1-|x| : 2*|x| : 0.5*ln2 + fmul %st(2) // 2*|x|^2 : 1-|x| : 2*|x| : 0.5*ln2 + fdivp // (2*|x|^2)/(1-|x|) : 2*|x| : 0.5*ln2 + faddp // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2 + fcoml MO(limit) // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2 + fnstsw // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2 + sahf + jae 4f + fyl2xp1 // 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|)) + FLT_CHECK_FORCE_UFLOW_NONNEG + jecxz 3f + fchs // 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x)) +3: ret + + .align ALIGNARG(4) +4: faddl MO(one) // 1+2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2 + fyl2x // 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|)) + jecxz 3f + fchs // 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x)) +3: ret + + .align ALIGNARG(4) +2: faddl MO(one) // 1+|x| : |x| : 0.5*ln2 + fxch // |x| : 1+|x| : 0.5*ln2 + fsubrl MO(one) // 1-|x| : 1+|x| : 0.5*ln2 + fdivrp // (1+|x|)/(1-|x|) : 0.5*ln2 + fyl2x // 0.5*ln2*ld((1+|x|)/(1-|x|)) + jecxz 3f + fchs // 0.5*ln2*ld((1+x)/(1-x)) +3: ret + + // x == NaN +5: flds 4(%esp) + ret +END(__ieee754_atanhf) +strong_alias (__ieee754_atanhf, __atanhf_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_atanhl.S b/REORG.TODO/sysdeps/i386/fpu/e_atanhl.S new file mode 100644 index 0000000000..31ff7e5182 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_atanhl.S @@ -0,0 +1,127 @@ +/* ix87 specific implementation of arctanh function. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> + + .section .rodata + + .align ALIGNARG(4) + /* Please note that we use double values for 0.5 and 1.0. These + numbers have exact representations and so we don't get accuracy + problems. The advantage is that the code is simpler. 
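+ The ln2_2 constant below is different: it is ln(2)/2, the scale
+ factor handed to fyl2xp1/fyl2x, so it is kept in full 80-bit
+ precision as a tfloat.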
*/ + .type half,@object +half: .double 0.5 + ASM_SIZE_DIRECTIVE(half) + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. */ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + .align ALIGNARG(4) + .type ln2_2,@object +ln2_2: .tfloat 0.3465735902799726547086160 + ASM_SIZE_DIRECTIVE(ln2_2) + +#ifdef PIC +#define MO(op) op##@GOTOFF(%edx) +#else +#define MO(op) op +#endif + + .text +ENTRY(__ieee754_atanhl) + movl 12(%esp), %ecx + + movl %ecx, %eax + andl $0x7fff, %eax + cmpl $0x7fff, %eax + je 5f + cmpl $0x3fdf, %eax + jge 7f + // Exponent below -32; return x, with underflow if subnormal. + fldt 4(%esp) + cmpl $0, %eax + jne 8f + fld %st(0) + fmul %st(0) + fstp %st(0) +8: ret +7: + +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + + andl $0x8000, %ecx // ECX == 0 iff X >= 0 + + fldt MO(ln2_2) // 0.5*ln2 + xorl %ecx, 12(%esp) + fldt 4(%esp) // |x| : 0.5*ln2 + fcoml MO(half) // |x| : 0.5*ln2 + fld %st(0) // |x| : |x| : 0.5*ln2 + fnstsw // |x| : |x| : 0.5*ln2 + sahf + jae 2f + fadd %st, %st(1) // |x| : 2*|x| : 0.5*ln2 + fld %st // |x| : |x| : 2*|x| : 0.5*ln2 + fsubrl MO(one) // 1-|x| : |x| : 2*|x| : 0.5*ln2 + fxch // |x| : 1-|x| : 2*|x| : 0.5*ln2 + fmul %st(2) // 2*|x|^2 : 1-|x| : 2*|x| : 0.5*ln2 + fdivp // (2*|x|^2)/(1-|x|) : 2*|x| : 0.5*ln2 + faddp // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2 + fcoml MO(limit) // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2 + fnstsw // 2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2 + sahf + jae 4f + fyl2xp1 // 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|)) + jecxz 3f + fchs // 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x)) +3: ret + + .align ALIGNARG(4) +4: faddl MO(one) // 1+2*|x|+(2*|x|^2)/(1-|x|) : 0.5*ln2 + fyl2x // 0.5*ln2*ld(1+2*|x|+(2*|x|^2)/(1-|x|)) + jecxz 3f + fchs // 0.5*ln2*ld(1+2*x+(2*x^2)/(1-x)) +3: ret + + .align ALIGNARG(4) +2: faddl MO(one) // 1+|x| : |x| : 0.5*ln2 + fxch // |x| : 1+|x| : 0.5*ln2 + fsubrl MO(one) // 1-|x| : 1+|x| : 0.5*ln2 + fdivrp // (1+|x|)/(1-|x|) : 0.5*ln2 + fyl2x // 0.5*ln2*ld((1+|x|)/(1-|x|)) + jecxz 3f + fchs // 0.5*ln2*ld((1+x)/(1-x)) +3: ret + + // x == NaN or ±Inf +5: cmpl $0x80000000, 8(%esp) + ja 6f + cmpl $0, 4(%esp) + je 7b +6: fldt 4(%esp) + fadd %st(0) + ret +END(__ieee754_atanhl) +strong_alias (__ieee754_atanhl, __atanhl_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp.S b/REORG.TODO/sysdeps/i386/fpu/e_exp.S new file mode 100644 index 0000000000..a7e7f13f6f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_exp.S @@ -0,0 +1,73 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + +DEFINE_DBL_MIN + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +#else +# define MO(op) op +#endif + + .text +/* e^x = 2^(x * log2(e)) */ +ENTRY(__ieee754_exp) +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + fldl 4(%esp) +/* I added the following ugly construct because exp(+-Inf) resulted + in NaN. The ugliness results from the bright minds at Intel. + For the i686 the code can be written better. + -- drepper@cygnus.com. */ + fxam /* Is NaN or +-Inf? */ + fstsw %ax + movb $0x45, %dh + andb %ah, %dh + cmpb $0x05, %dh + je 1f /* Is +-Inf, jump. 
*/
+ fldl2e
+ fmulp /* x * log2(e) */
+ fld %st
+ frndint /* int(x * log2(e)) */
+ fsubr %st,%st(1) /* fract(x * log2(e)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(e))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(e))) */
+ fscale /* e^x */
+ fstp %st(1)
+ DBL_NARROW_EVAL_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_exp)
+
+
+ENTRY(__exp_finite)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl2e
+ fmull 4(%esp) /* x * log2(e) */
+ fld %st
+ frndint /* int(x * log2(e)) */
+ fsubr %st,%st(1) /* fract(x * log2(e)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(e))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(e))) */
+ fscale /* e^x */
+ fstp %st(1)
+ DBL_NARROW_EVAL_UFLOW_NONNEG
+ ret
+END(__exp_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp10.S b/REORG.TODO/sysdeps/i386/fpu/e_exp10.S new file mode 100644 index 0000000000..acb5160a3f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_exp10.S @@ -0,0 +1,53 @@ +/*
+ * Written by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+/* 10^x = 2^(x * log2(10)) */
+ENTRY(__ieee754_exp10)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl 4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ fldl2t
+ fmulp /* x * log2(10) */
+ fld %st
+ frndint /* int(x * log2(10)) */
+ fsubr %st,%st(1) /* fract(x * log2(10)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(10))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(10))) */
+ fscale /* 10^x */
+ fstp %st(1)
+ DBL_NARROW_EVAL_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_exp10)
+strong_alias (__ieee754_exp10, __exp10_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp10f.S b/REORG.TODO/sysdeps/i386/fpu/e_exp10f.S new file mode 100644 index 0000000000..1812b34398 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_exp10f.S @@ -0,0 +1,53 @@ +/*
+ * Written by Ulrich Drepper.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+/* 10^x = 2^(x * log2(10)) */
+ENTRY(__ieee754_exp10f)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ flds 4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ fldl2t
+ fmulp /* x * log2(10) */
+ fld %st
+ frndint /* int(x * log2(10)) */
+ fsubr %st,%st(1) /* fract(x * log2(10)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(10))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(10))) */
+ fscale /* 10^x */
+ fstp %st(1)
+ FLT_NARROW_EVAL_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. 
*/
+2: ret
+END (__ieee754_exp10f)
+strong_alias (__ieee754_exp10f, __exp10f_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp10l.S b/REORG.TODO/sysdeps/i386/fpu/e_exp10l.S new file mode 100644 index 0000000000..d843e2b5e8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_exp10l.S @@ -0,0 +1,2 @@ +#define USE_AS_EXP10L +#include <e_expl.S> diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp2.S b/REORG.TODO/sysdeps/i386/fpu/e_exp2.S new file mode 100644 index 0000000000..fc16a96053 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_exp2.S @@ -0,0 +1,52 @@ +/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_exp2)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl 4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ fld %st
+ frndint /* int(x) */
+ fsubr %st,%st(1) /* fract(x) */
+ fxch
+ f2xm1 /* 2^(fract(x)) - 1 */
+ fld1
+ faddp /* 2^(fract(x)) */
+ fscale /* 2^x */
+ fstp %st(1)
+ DBL_NARROW_EVAL_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_exp2)
+strong_alias (__ieee754_exp2, __exp2_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp2f.S b/REORG.TODO/sysdeps/i386/fpu/e_exp2f.S new file mode 100644 index 0000000000..30623cd850 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_exp2f.S @@ -0,0 +1,52 @@ +/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_exp2f)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ flds 4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ fld %st
+ frndint /* int(x) */
+ fsubr %st,%st(1) /* fract(x) */
+ fxch
+ f2xm1 /* 2^(fract(x)) - 1 */
+ fld1
+ faddp /* 2^(fract(x)) */
+ fscale /* 2^x */
+ fstp %st(1)
+ FLT_NARROW_EVAL_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_exp2f)
+strong_alias (__ieee754_exp2f, __exp2f_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_exp2l.S b/REORG.TODO/sysdeps/i386/fpu/e_exp2l.S new file mode 100644 index 0000000000..c4cb73d589 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_exp2l.S @@ -0,0 +1,60 @@ +/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Adapted for exp2 by Ulrich Drepper <drepper@cygnus.com>.
+ * Public domain.
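+ *
+ * The long double version special-cases tiny arguments: for exponents
+ * below -65 it returns 1.0 + x, whose rounding agrees with that of 2^x
+ * in every rounding mode.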
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_LDBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_exp2l)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldt 4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ movzwl 4+8(%esp), %eax
+ andl $0x7fff, %eax
+ cmpl $0x3fbe, %eax
+ jge 3f
+ /* Argument's exponent below -65, result rounds to 1. */
+ fld1
+ faddp
+ ret
+3: fld %st
+ frndint /* int(x) */
+ fsubr %st,%st(1) /* fract(x) */
+ fxch
+ f2xm1 /* 2^(fract(x)) - 1 */
+ fld1
+ faddp /* 2^(fract(x)) */
+ fscale /* 2^x */
+ fstp %st(1)
+ LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_exp2l)
+strong_alias (__ieee754_exp2l, __exp2l_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_expf.S b/REORG.TODO/sysdeps/i386/fpu/e_expf.S new file mode 100644 index 0000000000..65cb4ec204 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_expf.S @@ -0,0 +1,74 @@ +/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%ecx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+/* e^x = 2^(x * log2(e)) */
+ENTRY(__ieee754_expf)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ flds 4(%esp)
+/* I added the following ugly construct because exp(+-Inf) resulted
+ in NaN. The ugliness results from the bright minds at Intel.
+ For the i686 the code can be written better.
+ -- drepper@cygnus.com. */
+ fxam /* Is NaN or +-Inf? */
+ fstsw %ax
+ movb $0x45, %dh
+ andb %ah, %dh
+ cmpb $0x05, %dh
+ je 1f /* Is +-Inf, jump. */
+ fldl2e
+ fmulp /* x * log2(e) */
+ fld %st
+ frndint /* int(x * log2(e)) */
+ fsubr %st,%st(1) /* fract(x * log2(e)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(e))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(e))) */
+ fscale /* e^x */
+ fstp %st(1)
+ FLT_NARROW_EVAL_UFLOW_NONNEG_NAN
+ ret
+
+1: testl $0x200, %eax /* Test sign. */
+ jz 2f /* If positive, jump. */
+ fstp %st
+ fldz /* Set result to 0. */
+2: ret
+END (__ieee754_expf)
+
+
+ENTRY(__expf_finite)
+#ifdef PIC
+ LOAD_PIC_REG (cx)
+#endif
+ fldl2e
+ fmuls 4(%esp) /* x * log2(e) */
+ fld %st
+ frndint /* int(x * log2(e)) */
+ fsubr %st,%st(1) /* fract(x * log2(e)) */
+ fxch
+ f2xm1 /* 2^(fract(x * log2(e))) - 1 */
+ fld1
+ faddp /* 2^(fract(x * log2(e))) */
+ fscale /* e^x */
+ fstp %st(1)
+ FLT_NARROW_EVAL_UFLOW_NONNEG
+ ret
+END(__expf_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_expl.S b/REORG.TODO/sysdeps/i386/fpu/e_expl.S new file mode 100644 index 0000000000..7d75fe22a1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_expl.S @@ -0,0 +1,226 @@ +/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+/*
+ * The 8087 method for the exponential function is to calculate
+ * exp(x) = 2^(x log2(e))
+ * after separating integer and fractional parts
+ * x log2(e) = i + f, |f| <= .5
+ * 2^i is immediate but f needs to be precise for long double accuracy.
+ * Suppress range reduction error in computing f by the following. + * Separate x into integer and fractional parts + * x = xi + xf, |xf| <= .5 + * Separate log2(e) into the sum of an exact number c0 and small part c1. + * c0 + c1 = log2(e) to extra precision + * Then + * f = (c0 xi - i) + c0 xf + c1 x + * where c0 xi is exact and so also is (c0 xi - i). + * -- moshier@na-net.ornl.gov + */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + +#ifdef USE_AS_EXP10L +# define IEEE754_EXPL __ieee754_exp10l +# define EXPL_FINITE __exp10l_finite +# define FLDLOG fldl2t +#elif defined USE_AS_EXPM1L +# define IEEE754_EXPL __expm1l +# undef EXPL_FINITE +# define FLDLOG fldl2e +#else +# define IEEE754_EXPL __ieee754_expl +# define EXPL_FINITE __expl_finite +# define FLDLOG fldl2e +#endif + + .section .rodata.cst16,"aM",@progbits,16 + + .p2align 4 +#ifdef USE_AS_EXP10L + .type c0,@object +c0: .byte 0, 0, 0, 0, 0, 0, 0x9a, 0xd4, 0x00, 0x40 + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(c0) + .type c1,@object +c1: .byte 0x58, 0x92, 0xfc, 0x15, 0x37, 0x9a, 0x97, 0xf0, 0xef, 0x3f + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(c1) +#else + .type c0,@object +c0: .byte 0, 0, 0, 0, 0, 0, 0xaa, 0xb8, 0xff, 0x3f + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(c0) + .type c1,@object +c1: .byte 0x20, 0xfa, 0xee, 0xc2, 0x5f, 0x70, 0xa5, 0xec, 0xed, 0x3f + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(c1) +#endif +#ifndef USE_AS_EXPM1L + .type csat,@object +csat: .byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x0e, 0x40 + .byte 0, 0, 0, 0, 0, 0 + ASM_SIZE_DIRECTIVE(csat) +DEFINE_LDBL_MIN +#endif + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +#else +# define MO(op) op +#endif + + .text +ENTRY(IEEE754_EXPL) +#ifdef USE_AS_EXPM1L + movzwl 4+8(%esp), %eax + xorb $0x80, %ah // invert sign bit (now 1 is "positive") + cmpl $0xc006, %eax // is num positive and exp >= 6 (number is >= 128.0)? + jae HIDDEN_JUMPTARGET (__expl) // (if num is denormal, it is at least >= 64.0) +#endif + fldt 4(%esp) +/* I added the following ugly construct because expl(+-Inf) resulted + in NaN. The ugliness results from the bright minds at Intel. + For the i686 the code can be written better. + -- drepper@cygnus.com. */ + fxam /* Is NaN or +-Inf? */ +#ifdef PIC + LOAD_PIC_REG (cx) +#endif +#ifdef USE_AS_EXPM1L + xorb $0x80, %ah + cmpl $0xc006, %eax + fstsw %ax + movb $0x45, %dh + jb 4f + + /* Below -64.0 (may be -NaN or -Inf). */ + andb %ah, %dh + cmpb $0x01, %dh + je 6f /* Is +-NaN, jump. */ + jmp 1f /* -large, possibly -Inf. */ + +4: /* In range -64.0 to 64.0 (may be +-0 but not NaN or +-Inf). */ + /* Test for +-0 as argument. */ + andb %ah, %dh + cmpb $0x40, %dh + je 2f + + /* Test for arguments that are small but not subnormal. */ + movzwl 4+8(%esp), %eax + andl $0x7fff, %eax + cmpl $0x3fbf, %eax + jge 3f + /* Argument's exponent below -64; avoid spurious underflow if + normal. */ + cmpl $0x0001, %eax + jge 2f + /* Force underflow and return the argument, to avoid wrong signs + of zero results from the code below in some rounding modes. */ + fld %st + fmul %st + fstp %st + jmp 2f +#else + movzwl 4+8(%esp), %eax + andl $0x7fff, %eax + cmpl $0x400d, %eax + jg 5f + cmpl $0x3fbc, %eax + jge 3f + /* Argument's exponent below -67, result rounds to 1. */ + fld1 + faddp + jmp 2f +5: /* Overflow, underflow or infinity or NaN as argument. */ + fstsw %ax + movb $0x45, %dh + andb %ah, %dh + cmpb $0x05, %dh + je 1f /* Is +-Inf, jump. */ + cmpb $0x01, %dh + je 6f /* Is +-NaN, jump. */ + /* Overflow or underflow; saturate. 
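+ csat is 2^15 = 32768, well beyond the overflow and underflow
+ thresholds of both expl (|x| around 11356.5) and exp10l (|x|
+ around 4932.1), so the shared code below produces the correctly
+ signed infinity or zero together with the right exceptions.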
*/ + fstp %st + fldt MO(csat) + andb $2, %ah + jz 3f + fchs +#endif +3: FLDLOG /* 1 log2(base) */ + fmul %st(1), %st /* 1 x log2(base) */ + /* Set round-to-nearest temporarily. */ + subl $8, %esp + cfi_adjust_cfa_offset (8) + fstcw 4(%esp) + movl $0xf3ff, %edx + andl 4(%esp), %edx + movl %edx, (%esp) + fldcw (%esp) + frndint /* 1 i */ + fld %st(1) /* 2 x */ + frndint /* 2 xi */ + fldcw 4(%esp) + addl $8, %esp + cfi_adjust_cfa_offset (-8) + fld %st(1) /* 3 i */ + fldt MO(c0) /* 4 c0 */ + fld %st(2) /* 5 xi */ + fmul %st(1), %st /* 5 c0 xi */ + fsubp %st, %st(2) /* 4 f = c0 xi - i */ + fld %st(4) /* 5 x */ + fsub %st(3), %st /* 5 xf = x - xi */ + fmulp %st, %st(1) /* 4 c0 xf */ + faddp %st, %st(1) /* 3 f = f + c0 xf */ + fldt MO(c1) /* 4 */ + fmul %st(4), %st /* 4 c1 * x */ + faddp %st, %st(1) /* 3 f = f + c1 * x */ + f2xm1 /* 3 2^(fract(x * log2(base))) - 1 */ +#ifdef USE_AS_EXPM1L + fstp %st(1) /* 2 */ + fscale /* 2 scale factor is st(1); base^x - 2^i */ + fxch /* 2 i */ + fld1 /* 3 1.0 */ + fscale /* 3 2^i */ + fld1 /* 4 1.0 */ + fsubrp %st, %st(1) /* 3 2^i - 1.0 */ + fstp %st(1) /* 2 */ + faddp %st, %st(1) /* 1 base^x - 1.0 */ +#else + fld1 /* 4 1.0 */ + faddp /* 3 2^(fract(x * log2(base))) */ + fstp %st(1) /* 2 */ + fscale /* 2 scale factor is st(1); base^x */ + fstp %st(1) /* 1 */ + LDBL_CHECK_FORCE_UFLOW_NONNEG +#endif + fstp %st(1) /* 0 */ + jmp 2f +1: +#ifdef USE_AS_EXPM1L + /* For expm1l, only negative sign gets here. */ + fstp %st + fld1 + fchs +#else + testl $0x200, %eax /* Test sign. */ + jz 2f /* If positive, jump. */ + fstp %st + fldz /* Set result to 0. */ +#endif +2: ret +6: /* NaN argument. */ + fadd %st + ret +END(IEEE754_EXPL) +#ifdef USE_AS_EXPM1L +libm_hidden_def (__expm1l) +weak_alias (__expm1l, expm1l) +#else +strong_alias (IEEE754_EXPL, EXPL_FINITE) +#endif diff --git a/REORG.TODO/sysdeps/i386/fpu/e_fmod.S b/REORG.TODO/sysdeps/i386/fpu/e_fmod.S new file mode 100644 index 0000000000..26b3acc392 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_fmod.S @@ -0,0 +1,18 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +ENTRY(__ieee754_fmod) + fldl 12(%esp) + fldl 4(%esp) +1: fprem + fstsw %ax + sahf + jp 1b + fstp %st(1) + ret +END (__ieee754_fmod) +strong_alias (__ieee754_fmod, __fmod_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_fmodf.S b/REORG.TODO/sysdeps/i386/fpu/e_fmodf.S new file mode 100644 index 0000000000..ece4d98427 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_fmodf.S @@ -0,0 +1,19 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>. + */ + +#include <machine/asm.h> + +ENTRY(__ieee754_fmodf) + flds 8(%esp) + flds 4(%esp) +1: fprem + fstsw %ax + sahf + jp 1b + fstp %st(1) + ret +END(__ieee754_fmodf) +strong_alias (__ieee754_fmodf, __fmodf_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_fmodl.c b/REORG.TODO/sysdeps/i386/fpu/e_fmodl.c new file mode 100644 index 0000000000..49700ae8f6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_fmodl.c @@ -0,0 +1,23 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. 
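+ *
+ * fprem computes only a partial remainder: each execution reduces the
+ * exponent difference by at most 63 and sets C2 while the reduction is
+ * incomplete. fstsw/sahf move C2 into PF, which the "jp 1b" loop tests.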
+ */
+
+#include <math_private.h>
+
+long double
+__ieee754_fmodl (long double x, long double y)
+{
+ long double res;
+
+ asm ("1:\tfprem\n"
+ "fstsw %%ax\n"
+ "sahf\n"
+ "jp 1b\n"
+ "fstp %%st(1)"
+ : "=t" (res) : "0" (x), "u" (y) : "ax", "st(1)");
+ return res;
+}
+strong_alias (__ieee754_fmodl, __fmodl_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_hypot.S b/REORG.TODO/sysdeps/i386/fpu/e_hypot.S new file mode 100644 index 0000000000..7403566fd7 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_hypot.S @@ -0,0 +1,75 @@ +/* Compute the hypotenuse of X and Y.
+ Copyright (C) 1998-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <i386-math-asm.h>
+
+DEFINE_DBL_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+ .text
+ENTRY(__ieee754_hypot)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+ fldl 4(%esp) // x
+ fxam
+ fnstsw
+ fldl 12(%esp) // y : x
+ movb %ah, %ch
+ fxam
+ fnstsw
+ movb %ah, %al
+ orb %ch, %ah
+ sahf
+ jc 1f
+ fmul %st(0) // y * y : x
+ fxch // x : y * y
+ fmul %st(0) // x * x : y * y
+ faddp // x * x + y * y
+ fsqrt
+ DBL_NARROW_EVAL_UFLOW_NONNEG
+2: ret
+
+ // We have to test whether any of the parameters is Inf.
+ // In this case the result is infinity.
+1: andb $0x45, %al
+ cmpb $5, %al
+ je 3f // jump if y is Inf
+ andb $0x45, %ch
+ cmpb $5, %ch
+ jne 4f // jump if x is not Inf
+ fxch
+3: fstp %st(1)
+ fabs
+ jmp 2b
+
+4: testb $1, %al
+ jnz 5f // y is NaN
+ fxch
+5: fstp %st(1)
+ jmp 2b
+
+END(__ieee754_hypot)
+strong_alias (__ieee754_hypot, __hypot_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_hypotf.S b/REORG.TODO/sysdeps/i386/fpu/e_hypotf.S new file mode 100644 index 0000000000..6a2c7052b2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_hypotf.S @@ -0,0 +1,64 @@ +/* Compute the hypotenuse of X and Y.
+ Copyright (C) 1998-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <i386-math-asm.h> + + .text +ENTRY(__ieee754_hypotf) + flds 4(%esp) // x + fxam + fnstsw + flds 8(%esp) // y : x + movb %ah, %ch + fxam + fnstsw + movb %ah, %al + orb %ch, %ah + sahf + jc 1f + fmul %st(0) // y * y : x + fxch // x : y * y + fmul %st(0) // x * x : y * y + faddp // x * x + y * y + fsqrt + FLT_NARROW_EVAL +2: ret + + // We have to test whether any of the parameters is Inf. + // In this case the result is infinity. +1: andb $0x45, %al + cmpb $5, %al + je 3f // jump if y is Inf + andb $0x45, %ch + cmpb $5, %ch + jne 4f // jump if x is not Inf + fxch +3: fstp %st(1) + fabs + jmp 2b + +4: testb $1, %al + jnz 5f // y is NaN + fxch +5: fstp %st(1) + jmp 2b + +END(__ieee754_hypotf) +strong_alias (__ieee754_hypotf, __hypotf_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_ilogb.S b/REORG.TODO/sysdeps/i386/fpu/e_ilogb.S new file mode 100644 index 0000000000..29ef2214e6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_ilogb.S @@ -0,0 +1,42 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_ilogb.S,v 1.5 1995/10/12 15:53:09 jtc Exp $") + +ENTRY(__ieee754_ilogb) + fldl 4(%esp) +/* I added the following ugly construct because ilogb(+-Inf) is + required to return INT_MAX in ISO C99. + -- jakub@redhat.com. */ + fxam /* Is NaN or +-Inf? */ + fstsw %ax + movb $0x45, %dh + andb %ah, %dh + cmpb $0x05, %dh + je 1f /* Is +-Inf, jump. */ + cmpb $0x40, %dh + je 2f /* Is +-0, jump. */ + + fxtract + pushl %eax + cfi_adjust_cfa_offset (4) + fstp %st + + fistpl (%esp) + fwait + popl %eax + cfi_adjust_cfa_offset (-4) + + ret + +1: fstp %st + movl $0x7fffffff, %eax + ret +2: fstp %st + movl $0x80000000, %eax /* FP_ILOGB0 */ + ret +END (__ieee754_ilogb) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S b/REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S new file mode 100644 index 0000000000..d72de6c84a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_ilogbf.S @@ -0,0 +1,42 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_ilogbf.S,v 1.4 1995/10/22 20:32:43 pk Exp $") + +ENTRY(__ieee754_ilogbf) + flds 4(%esp) +/* I added the following ugly construct because ilogb(+-Inf) is + required to return INT_MAX in ISO C99. + -- jakub@redhat.com. */ + fxam /* Is NaN or +-Inf? */ + fstsw %ax + movb $0x45, %dh + andb %ah, %dh + cmpb $0x05, %dh + je 1f /* Is +-Inf, jump. */ + cmpb $0x40, %dh + je 2f /* Is +-0, jump. */ + + fxtract + pushl %eax + cfi_adjust_cfa_offset (4) + fstp %st + + fistpl (%esp) + fwait + popl %eax + cfi_adjust_cfa_offset (-4) + + ret + +1: fstp %st + movl $0x7fffffff, %eax + ret +2: fstp %st + movl $0x80000000, %eax /* FP_ILOGB0 */ + ret +END (__ieee754_ilogbf) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S b/REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S new file mode 100644 index 0000000000..60761dfa38 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_ilogbl.S @@ -0,0 +1,43 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Changes for long double by Ulrich Drepper <drepper@cygnus.com> + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: $") + +ENTRY(__ieee754_ilogbl) + fldt 4(%esp) +/* I added the following ugly construct because ilogb(+-Inf) is + required to return INT_MAX in ISO C99. + -- jakub@redhat.com. */ + fxam /* Is NaN or +-Inf? */ + fstsw %ax + movb $0x45, %dh + andb %ah, %dh + cmpb $0x05, %dh + je 1f /* Is +-Inf, jump. */ + cmpb $0x40, %dh + je 2f /* Is +-0, jump. 
*/ + + fxtract + pushl %eax + cfi_adjust_cfa_offset (4) + fstp %st + + fistpl (%esp) + fwait + popl %eax + cfi_adjust_cfa_offset (-4) + + ret + +1: fstp %st + movl $0x7fffffff, %eax + ret +2: fstp %st + movl $0x80000000, %eax /* FP_ILOGB0 */ + ret +END (__ieee754_ilogbl) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log.S b/REORG.TODO/sysdeps/i386/fpu/e_log.S new file mode 100644 index 0000000000..335df22577 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_log.S @@ -0,0 +1,92 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. */ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%edx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_log) + fldln2 // log(2) + fldl 4(%esp) // x : log(2) + fxam + fnstsw +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fld %st // x : x : log(2) + sahf + jc 3f // in case x is NaN or +-Inf +4: fsubl MO(one) // x-1 : x : log(2) + fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fcompl MO(limit) // x-1 : x : log(2) + fnstsw // x-1 : x : log(2) + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret + +2: fstp %st(0) // x : log(2) + fyl2x // log(x) + ret + +3: jp 4b // in case x is +-Inf + fstp %st(1) + fstp %st(1) + ret +END (__ieee754_log) + +ENTRY(__log_finite) + fldln2 // log(2) + fldl 4(%esp) // x : log(2) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fld %st // x : x : log(2) + fsubl MO(one) // x-1 : x : log(2) + fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fcompl MO(limit) // x-1 : x : log(2) + fnstsw // x-1 : x : log(2) + andb $0x45, %ah + jz 2b + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 6f + fabs // log(1) is +0 in all rounding modes. +6: fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret +END(__log_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log10.S b/REORG.TODO/sysdeps/i386/fpu/e_log10.S new file mode 100644 index 0000000000..17277084ca --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_log10.S @@ -0,0 +1,68 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. 
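+ (The fyl2xp1 argument must stay below 1 - sqrt(2)/2, about 0.2929,
+ in magnitude; 0.29 leaves a small safety margin.)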
*/ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%edx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_log10) + fldlg2 // log10(2) + fldl 4(%esp) // x : log10(2) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fxam + fnstsw + fld %st // x : x : log10(2) + sahf + jc 3f // in case x is NaN or ±Inf +4: fsubl MO(one) // x-1 : x : log10(2) + fld %st // x-1 : x-1 : x : log10(2) + fabs // |x-1| : x-1 : x : log10(2) + fcompl MO(limit) // x-1 : x : log10(2) + fnstsw // x-1 : x : log10(2) + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log10(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : log10(2) + fyl2xp1 // log10(x) + ret + +2: fstp %st(0) // x : log10(2) + fyl2x // log10(x) + ret + +3: jp 4b // in case x is ±Inf + fstp %st(1) + fstp %st(1) + ret +END (__ieee754_log10) +strong_alias (__ieee754_log10, __log10_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log10f.S b/REORG.TODO/sysdeps/i386/fpu/e_log10f.S new file mode 100644 index 0000000000..72a3b88251 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_log10f.S @@ -0,0 +1,69 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>. + * + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. */ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%edx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_log10f) + fldlg2 // log10(2) + flds 4(%esp) // x : log10(2) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fxam + fnstsw + fld %st // x : x : log10(2) + sahf + jc 3f // in case x is NaN or ±Inf +4: fsubl MO(one) // x-1 : x : log10(2) + fld %st // x-1 : x-1 : x : log10(2) + fabs // |x-1| : x-1 : x : log10(2) + fcompl MO(limit) // x-1 : x : log10(2) + fnstsw // x-1 : x : log10(2) + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log10(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : log10(2) + fyl2xp1 // log10(x) + ret + +2: fstp %st(0) // x : log10(2) + fyl2x // log10(x) + ret + +3: jp 4b // in case x is ±Inf + fstp %st(1) + fstp %st(1) + ret +END (__ieee754_log10f) +strong_alias (__ieee754_log10f, __log10f_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log10l.S b/REORG.TODO/sysdeps/i386/fpu/e_log10l.S new file mode 100644 index 0000000000..9326b19796 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_log10l.S @@ -0,0 +1,71 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + * + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. 
*/ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%edx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_log10l) + fldlg2 // log10(2) + fldt 4(%esp) // x : log10(2) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fxam + fnstsw + fld %st // x : x : log10(2) + sahf + jc 3f // in case x is NaN or ±Inf +4: fsubl MO(one) // x-1 : x : log10(2) + fld %st // x-1 : x-1 : x : log10(2) + fabs // |x-1| : x-1 : x : log10(2) + fcompl MO(limit) // x-1 : x : log10(2) + fnstsw // x-1 : x : log10(2) + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log10(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : log10(2) + fyl2xp1 // log10(x) + ret + +2: fstp %st(0) // x : log10(2) + fyl2x // log10(x) + ret + +3: jp 4b // in case x is ±Inf + fstp %st(1) + fstp %st(1) + fadd %st(0) + ret +END(__ieee754_log10l) +strong_alias (__ieee754_log10l, __log10l_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log2.S b/REORG.TODO/sysdeps/i386/fpu/e_log2.S new file mode 100644 index 0000000000..73ff0fffd3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_log2.S @@ -0,0 +1,69 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>. + * Public domain. + * + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. */ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%edx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_log2) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fldl MO(one) + fldl 4(%esp) // x : 1 + fxam + fnstsw + fld %st // x : x : 1 + sahf + jc 3f // in case x is NaN or ±Inf +4: fsub %st(2), %st // x-1 : x : 1 + fld %st // x-1 : x-1 : x : 1 + fabs // |x-1| : x-1 : x : 1 + fcompl MO(limit) // x-1 : x : 1 + fnstsw // x-1 : x : 1 + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log2(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : 1 + fyl2xp1 // log(x) + ret + +2: fstp %st(0) // x : 1 + fyl2x // log(x) + ret + +3: jp 4b // in case x is ±Inf + fstp %st(1) + fstp %st(1) + ret +END (__ieee754_log2) +strong_alias (__ieee754_log2, __log2_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log2f.S b/REORG.TODO/sysdeps/i386/fpu/e_log2f.S new file mode 100644 index 0000000000..344eeb495e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_log2f.S @@ -0,0 +1,69 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>. + * Public domain. + * + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. 
*/ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%edx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_log2f) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fldl MO(one) + flds 4(%esp) // x : 1 + fxam + fnstsw + fld %st // x : x : 1 + sahf + jc 3f // in case x is NaN or ±Inf +4: fsub %st(2), %st // x-1 : x : 1 + fld %st // x-1 : x-1 : x : 1 + fabs // |x-1| : x-1 : x : 1 + fcompl MO(limit) // x-1 : x : 1 + fnstsw // x-1 : x : 1 + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log2(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : 1 + fyl2xp1 // log(x) + ret + +2: fstp %st(0) // x : 1 + fyl2x // log(x) + ret + +3: jp 4b // in case x is ±Inf + fstp %st(1) + fstp %st(1) + ret +END (__ieee754_log2f) +strong_alias (__ieee754_log2f, __log2f_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_log2l.S b/REORG.TODO/sysdeps/i386/fpu/e_log2l.S new file mode 100644 index 0000000000..73e62ea908 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_log2l.S @@ -0,0 +1,70 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Adapted for use as log2 by Ulrich Drepper <drepper@cygnus.com>. + * Public domain. + * + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. */ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%edx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_log2l) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fldl MO(one) + fldt 4(%esp) // x : 1 + fxam + fnstsw + fld %st // x : x : 1 + sahf + jc 3f // in case x is NaN or ±Inf +4: fsub %st(2), %st // x-1 : x : 1 + fld %st // x-1 : x-1 : x : 1 + fabs // |x-1| : x-1 : x : 1 + fcompl MO(limit) // x-1 : x : 1 + fnstsw // x-1 : x : 1 + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log2(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : 1 + fyl2xp1 // log(x) + ret + +2: fstp %st(0) // x : 1 + fyl2x // log(x) + ret + +3: jp 4b // in case x is ±Inf + fstp %st(1) + fstp %st(1) + fadd %st(0) + ret +END (__ieee754_log2l) +strong_alias (__ieee754_log2l, __log2l_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_logf.S b/REORG.TODO/sysdeps/i386/fpu/e_logf.S new file mode 100644 index 0000000000..de967a31f5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_logf.S @@ -0,0 +1,93 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * Adapted for float by Ulrich Drepper <drepper@cygnus.com>. + * + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. 
*/ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%edx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_logf) + fldln2 // log(2) + flds 4(%esp) // x : log(2) + fxam + fnstsw +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fld %st // x : x : log(2) + sahf + jc 3f // in case x is NaN or +-Inf +4: fsubl MO(one) // x-1 : x : log(2) + fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fcompl MO(limit) // x-1 : x : log(2) + fnstsw // x-1 : x : log(2) + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret + +2: fstp %st(0) // x : log(2) + fyl2x // log(x) + ret + +3: jp 4b // in case x is +-Inf + fstp %st(1) + fstp %st(1) + ret +END (__ieee754_logf) + +ENTRY(__logf_finite) + fldln2 // log(2) + flds 4(%esp) // x : log(2) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fld %st // x : x : log(2) + fsubl MO(one) // x-1 : x : log(2) + fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fcompl MO(limit) // x-1 : x : log(2) + fnstsw // x-1 : x : log(2) + andb $0x45, %ah + jz 2b + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 6f + fabs // log(1) is +0 in all rounding modes. +6: fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret +END(__logf_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_logl.S b/REORG.TODO/sysdeps/i386/fpu/e_logl.S new file mode 100644 index 0000000000..53127d704e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_logl.S @@ -0,0 +1,97 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + */ + +#include <machine/asm.h> + + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. */ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%edx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_logl) + fldln2 // log(2) + fldt 4(%esp) // x : log(2) + fxam + fnstsw +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fld %st // x : x : log(2) + sahf + jc 3f // in case x is NaN or +-Inf + movzwl 4+8(%esp), %eax + cmpl $0xc000, %eax + jae 6f // x <= -2, avoid overflow from -LDBL_MAX - 1. +4: fsubl MO(one) // x-1 : x : log(2) +6: fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fcompl MO(limit) // x-1 : x : log(2) + fnstsw // x-1 : x : log(2) + andb $0x45, %ah + jz 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 5f + fabs // log(1) is +0 in all rounding modes. +5: fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret + +2: fstp %st(0) // x : log(2) + fyl2x // log(x) + ret + +3: jp 4b // in case x is +-Inf + fstp %st(1) + fstp %st(1) + fadd %st(0) + ret +END (__ieee754_logl) + +ENTRY(__logl_finite) + fldln2 // log(2) + fldt 4(%esp) // x : log(2) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fld %st // x : x : log(2) + fsubl MO(one) // x-1 : x : log(2) + fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fcompl MO(limit) // x-1 : x : log(2) + fnstsw // x-1 : x : log(2) + andb $0x45, %ah + jz 2b + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 7f + fabs // log(1) is +0 in all rounding modes. 
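+ // Here |x-1| <= 0.29, since the limit test above fell through, so
+ // fyl2xp1 is used inside its full-precision domain.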
+7: fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret +END(__logl_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_pow.S b/REORG.TODO/sysdeps/i386/fpu/e_pow.S new file mode 100644 index 0000000000..2edb9a9fbc --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_pow.S @@ -0,0 +1,456 @@ +/* ix87 specific implementation of pow function. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + .type p63,@object +p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 + ASM_SIZE_DIRECTIVE(p63) + .type p10,@object +p10: .byte 0, 0, 0, 0, 0, 0, 0x90, 0x40 + ASM_SIZE_DIRECTIVE(p10) + + .section .rodata.cst16,"aM",@progbits,16 + + .p2align 3 + .type infinity,@object +inf_zero: +infinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f + ASM_SIZE_DIRECTIVE(infinity) + .type zero,@object +zero: .double 0.0 + ASM_SIZE_DIRECTIVE(zero) + .type minf_mzero,@object +minf_mzero: +minfinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff +mzero: + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + ASM_SIZE_DIRECTIVE(minf_mzero) +DEFINE_DBL_MIN + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) +#else +# define MO(op) op +# define MOX(op,x,f) op(,x,f) +#endif + + .text +ENTRY(__ieee754_pow) + fldl 12(%esp) // y + fxam + +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + + fnstsw + movb %ah, %dl + andb $0x45, %ah + cmpb $0x40, %ah // is y == 0 ? + je 11f + + cmpb $0x05, %ah // is y == ±inf ? + je 12f + + cmpb $0x01, %ah // is y == NaN ? + je 30f + + fldl 4(%esp) // x : y + + subl $8,%esp + cfi_adjust_cfa_offset (8) + + fxam + fnstsw + movb %ah, %dh + andb $0x45, %ah + cmpb $0x40, %ah + je 20f // x is ±0 + + cmpb $0x05, %ah + je 15f // x is ±inf + + cmpb $0x01, %ah + je 32f // x is NaN + + fxch // y : x + + /* fistpll raises invalid exception for |y| >= 1L<<63. */ + fld %st // y : y : x + fabs // |y| : y : x + fcompl MO(p63) // y : x + fnstsw + sahf + jnc 2f + + /* First see whether `y' is a natural number. In this case we + can use a more precise algorithm. */ + fld %st // y : y : x + fistpll (%esp) // y : x + fildll (%esp) // int(y) : y : x + fucomp %st(1) // y : x + fnstsw + sahf + jne 3f + + /* OK, we have an integer value for y. If large enough that + errors may propagate out of the 11 bits excess precision, use + the algorithm for real exponent instead. 
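+ ("Large enough" is |y| >= 2^10: the square-and-multiply loop below
+ costs up to two 80-bit multiplies per bit of y, and the 64-bit
+ mantissa has only 64-53 = 11 spare bits over double precision to
+ absorb their rounding errors.)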
*/ + fld %st // y : y : x + fabs // |y| : y : x + fcompl MO(p10) // y : x + fnstsw + sahf + jnc 2f + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + orl $0, %edx + fstp %st(0) // x + jns 4f // y >= 0, jump + fdivrl MO(one) // 1/x (now referred to as x) + negl %eax + adcl $0, %edx + negl %edx +4: fldl MO(one) // 1 : x + fxch + + /* If y is even, take the absolute value of x. Otherwise, + ensure all intermediate values that might overflow have the + sign of x. */ + testb $1, %al + jnz 6f + fabs + +6: shrdl $1, %edx, %eax + jnc 5f + fxch + fabs + fmul %st(1) // x : ST*x + fxch +5: fld %st // x : x : ST*x + fabs // |x| : x : ST*x + fmulp // |x|*x : ST*x + shrl $1, %edx + movl %eax, %ecx + orl %edx, %ecx + jnz 6b + fstp %st(0) // ST*x +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + DBL_NARROW_EVAL_UFLOW_NONNAN + ret + + /* y is ±NAN */ +30: fldl 4(%esp) // x : y + fldl MO(one) // 1.0 : x : y + fucomp %st(1) // x : y + fnstsw + sahf + je 31f + fxch // y : x +31: fstp %st(1) + ret + + cfi_adjust_cfa_offset (8) +32: addl $8, %esp + cfi_adjust_cfa_offset (-8) + fstp %st(1) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) +2: // y is a large integer (absolute value at least 1L<<10), but + // may be odd unless at least 1L<<64. So it may be necessary + // to adjust the sign of a negative result afterwards. + fxch // x : y + fabs // |x| : y + fxch // y : x + .align ALIGNARG(4) +3: /* y is a real number. */ + fxch // x : y + fldl MO(one) // 1.0 : x : y + fldl MO(limit) // 0.29 : 1.0 : x : y + fld %st(2) // x : 0.29 : 1.0 : x : y + fsub %st(2) // x-1 : 0.29 : 1.0 : x : y + fabs // |x-1| : 0.29 : 1.0 : x : y + fucompp // 1.0 : x : y + fnstsw + fxch // x : 1.0 : y + sahf + ja 7f + fsub %st(1) // x-1 : 1.0 : y + fyl2xp1 // log2(x) : y + jmp 8f + +7: fyl2x // log2(x) : y +8: fmul %st(1) // y*log2(x) : y + fst %st(1) // y*log2(x) : y*log2(x) + frndint // int(y*log2(x)) : y*log2(x) + fsubr %st, %st(1) // int(y*log2(x)) : fract(y*log2(x)) + fxch // fract(y*log2(x)) : int(y*log2(x)) + f2xm1 // 2^fract(y*log2(x))-1 : int(y*log2(x)) + faddl MO(one) // 2^fract(y*log2(x)) : int(y*log2(x)) + + // Before scaling, we must negate if x is negative and y is an + // odd integer. + testb $2, %dh + jz 291f + // x is negative. If y is an odd integer, negate the result. + fldl 20(%esp) // y : 2^fract(y*log2(x)) : int(y*log2(x)) + fld %st // y : y : 2^fract(y*log2(x)) : int(y*log2(x)) + fabs // |y| : y : 2^fract(y*log2(x)) : int(y*log2(x)) + fcompl MO(p63) // y : 2^fract(y*log2(x)) : int(y*log2(x)) + fnstsw + sahf + jnc 290f + + // We must find out whether y is an odd integer. + fld %st // y : y : 2^fract(y*log2(x)) : int(y*log2(x)) + fistpll (%esp) // y : 2^fract(y*log2(x)) : int(y*log2(x)) + fildll (%esp) // int(y) : y : 2^fract(y*log2(x)) : int(y*log2(x)) + fucompp // 2^fract(y*log2(x)) : int(y*log2(x)) + fnstsw + sahf + jne 291f + + // OK, the value is an integer, but is it odd? + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + andb $1, %al + jz 292f // jump if not odd + // It's an odd integer. 
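+ // The power was computed from |x|; negate it to restore the sign.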
+ fchs + jmp 292f + + cfi_adjust_cfa_offset (8) +290: fstp %st(0) // 2^fract(y*log2(x)) : int(y*log2(x)) +291: addl $8, %esp + cfi_adjust_cfa_offset (-8) +292: fscale // +/- 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x)) + fstp %st(1) // +/- 2^fract(y*log2(x))*2^int(y*log2(x)) + DBL_NARROW_EVAL_UFLOW_NONNAN + ret + + + // pow(x,±0) = 1 + .align ALIGNARG(4) +11: fstp %st(0) // pop y + fldl MO(one) + ret + + // y == ±inf + .align ALIGNARG(4) +12: fstp %st(0) // pop y + fldl MO(one) // 1 + fldl 4(%esp) // x : 1 + fabs // abs(x) : 1 + fucompp // < 1, == 1, or > 1 + fnstsw + andb $0x45, %ah + cmpb $0x45, %ah + je 13f // jump if x is NaN + + cmpb $0x40, %ah + je 14f // jump if |x| == 1 + + shlb $1, %ah + xorb %ah, %dl + andl $2, %edx + fldl MOX(inf_zero, %edx, 4) + ret + + .align ALIGNARG(4) +14: fldl MO(one) + ret + + .align ALIGNARG(4) +13: fldl 4(%esp) // load x == NaN + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) + // x is ±inf +15: fstp %st(0) // y + testb $2, %dh + jz 16f // jump if x == +inf + + // fistpll raises invalid exception for |y| >= 1L<<63, so test + // that (in which case y is certainly even) before testing + // whether y is odd. + fld %st // y : y + fabs // |y| : y + fcompl MO(p63) // y + fnstsw + sahf + jnc 16f + + // We must find out whether y is an odd integer. + fld %st // y : y + fistpll (%esp) // y + fildll (%esp) // int(y) : y + fucompp // <empty> + fnstsw + sahf + jne 17f + + // OK, the value is an integer. + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + andb $1, %al + jz 18f // jump if not odd + // It's an odd integer. + shrl $31, %edx + fldl MOX(minf_mzero, %edx, 8) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) +16: fcompl MO(zero) + addl $8, %esp + cfi_adjust_cfa_offset (-8) + fnstsw + shrl $5, %eax + andl $8, %eax + fldl MOX(inf_zero, %eax, 1) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) +17: shll $30, %edx // sign bit for y in right position + addl $8, %esp + cfi_adjust_cfa_offset (-8) +18: shrl $31, %edx + fldl MOX(inf_zero, %edx, 8) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) + // x is ±0 +20: fstp %st(0) // y + testb $2, %dl + jz 21f // y > 0 + + // x is ±0 and y is < 0. We must find out whether y is an odd integer. + testb $2, %dh + jz 25f + + // fistpll raises invalid exception for |y| >= 1L<<63, so test + // that (in which case y is certainly even) before testing + // whether y is odd. + fld %st // y : y + fabs // |y| : y + fcompl MO(p63) // y + fnstsw + sahf + jnc 25f + + fld %st // y : y + fistpll (%esp) // y + fildll (%esp) // int(y) : y + fucompp // <empty> + fnstsw + sahf + jne 26f + + // OK, the value is an integer. + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + andb $1, %al + jz 27f // jump if not odd + // It's an odd integer. + // Raise divide-by-zero exception and get minus infinity value. + fldl MO(one) + fdivl MO(zero) + fchs + ret + + cfi_adjust_cfa_offset (8) +25: fstp %st(0) +26: addl $8, %esp + cfi_adjust_cfa_offset (-8) +27: // Raise divide-by-zero exception and get infinity value. + fldl MO(one) + fdivl MO(zero) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) + // x is ±0 and y is > 0. We must find out whether y is an odd integer. +21: testb $2, %dh + jz 22f + + // fistpll raises invalid exception for |y| >= 1L<<63, so test + // that (in which case y is certainly even) before testing + // whether y is odd. 
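+	// This point is reached only for y > 0 (the sign test at 20
+	// above), so y can be compared with 2^63 directly, without fabs;
+	// fcoml does not pop, leaving y on the stack for the parity
+	// check below.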
+ fcoml MO(p63) // y + fnstsw + sahf + jnc 22f + + fld %st // y : y + fistpll (%esp) // y + fildll (%esp) // int(y) : y + fucompp // <empty> + fnstsw + sahf + jne 23f + + // OK, the value is an integer. + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + andb $1, %al + jz 24f // jump if not odd + // It's an odd integer. + fldl MO(mzero) + ret + + cfi_adjust_cfa_offset (8) +22: fstp %st(0) +23: addl $8, %esp // Don't use 2 x pop + cfi_adjust_cfa_offset (-8) +24: fldl MO(zero) + ret + +END(__ieee754_pow) +strong_alias (__ieee754_pow, __pow_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_powf.S b/REORG.TODO/sysdeps/i386/fpu/e_powf.S new file mode 100644 index 0000000000..467ef2380b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_powf.S @@ -0,0 +1,392 @@ +/* ix87 specific implementation of pow function. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + .type p31,@object +p31: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x41 + ASM_SIZE_DIRECTIVE(p31) + + .section .rodata.cst16,"aM",@progbits,16 + + .p2align 3 + .type infinity,@object +inf_zero: +infinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f + ASM_SIZE_DIRECTIVE(infinity) + .type zero,@object +zero: .double 0.0 + ASM_SIZE_DIRECTIVE(zero) + .type minf_mzero,@object +minf_mzero: +minfinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff +mzero: + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + ASM_SIZE_DIRECTIVE(minf_mzero) +DEFINE_FLT_MIN + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) +#else +# define MO(op) op +# define MOX(op,x,f) op(,x,f) +#endif + + .text +ENTRY(__ieee754_powf) + flds 8(%esp) // y + fxam + +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + + fnstsw + movb %ah, %dl + andb $0x45, %ah + cmpb $0x40, %ah // is y == 0 ? + je 11f + + cmpb $0x05, %ah // is y == ±inf ? + je 12f + + cmpb $0x01, %ah // is y == NaN ? + je 30f + + flds 4(%esp) // x : y + + subl $4, %esp + cfi_adjust_cfa_offset (4) + + fxam + fnstsw + movb %ah, %dh + andb $0x45, %ah + cmpb $0x40, %ah + je 20f // x is ±0 + + cmpb $0x05, %ah + je 15f // x is ±inf + + cmpb $0x01, %ah + je 33f // x is NaN + + fxch // y : x + + /* fistpl raises invalid exception for |y| >= 1L<<31. */ + fld %st // y : y : x + fabs // |y| : y : x + fcompl MO(p31) // y : x + fnstsw + sahf + jnc 2f + + /* First see whether `y' is a natural number. In this case we + can use a more precise algorithm. 
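+	   (Unlike the double version no further size cutoff is needed
+	   here: |y| is below 2^31, and an error of roughly |y| ulps from
+	   the squaring loop fits easily in the 40 bits of excess
+	   precision the 387 keeps over a float.)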
*/ + fld %st // y : y : x + fistpl (%esp) // y : x + fildl (%esp) // int(y) : y : x + fucomp %st(1) // y : x + fnstsw + sahf + jne 3f + + /* OK, we have an integer value for y. */ + popl %edx + cfi_adjust_cfa_offset (-4) + orl $0, %edx + fstp %st(0) // x + jns 4f // y >= 0, jump + fdivrl MO(one) // 1/x (now referred to as x) + negl %edx +4: fldl MO(one) // 1 : x + fxch + + /* If y is even, take the absolute value of x. Otherwise, + ensure all intermediate values that might overflow have the + sign of x. */ + testb $1, %dl + jnz 6f + fabs + +6: shrl $1, %edx + jnc 5f + fxch + fabs + fmul %st(1) // x : ST*x + fxch +5: fld %st // x : x : ST*x + fabs // |x| : x : ST*x + fmulp // |x|*x : ST*x + testl %edx, %edx + jnz 6b + fstp %st(0) // ST*x + FLT_NARROW_EVAL_UFLOW_NONNAN + ret + + /* y is ±NAN */ +30: flds 4(%esp) // x : y + fldl MO(one) // 1.0 : x : y + fucomp %st(1) // x : y + fnstsw + sahf + je 31f + fxch // y : x +31: fstp %st(1) + ret + + cfi_adjust_cfa_offset (4) + .align ALIGNARG(4) +2: /* y is a large integer (so even). */ + fxch // x : y + fabs // |x| : y + fxch // y : x + .align ALIGNARG(4) +3: /* y is a real number. */ + fxch // x : y + fldl MO(one) // 1.0 : x : y + fldl MO(limit) // 0.29 : 1.0 : x : y + fld %st(2) // x : 0.29 : 1.0 : x : y + fsub %st(2) // x-1 : 0.29 : 1.0 : x : y + fabs // |x-1| : 0.29 : 1.0 : x : y + fucompp // 1.0 : x : y + fnstsw + fxch // x : 1.0 : y + sahf + ja 7f + fsub %st(1) // x-1 : 1.0 : y + fyl2xp1 // log2(x) : y + jmp 8f + +7: fyl2x // log2(x) : y +8: fmul %st(1) // y*log2(x) : y + fst %st(1) // y*log2(x) : y*log2(x) + frndint // int(y*log2(x)) : y*log2(x) + fsubr %st, %st(1) // int(y*log2(x)) : fract(y*log2(x)) + fxch // fract(y*log2(x)) : int(y*log2(x)) + f2xm1 // 2^fract(y*log2(x))-1 : int(y*log2(x)) + faddl MO(one) // 2^fract(y*log2(x)) : int(y*log2(x)) + fscale // 2^fract(y*log2(x))*2^int(y*log2(x)) : int(y*log2(x)) +32: addl $4, %esp + cfi_adjust_cfa_offset (-4) + fstp %st(1) // 2^fract(y*log2(x))*2^int(y*log2(x)) + FLT_NARROW_EVAL_UFLOW_NONNAN + ret + + /* x is NaN. */ + cfi_adjust_cfa_offset (4) +33: addl $4, %esp + cfi_adjust_cfa_offset (-4) + fstp %st(1) + ret + + // pow(x,±0) = 1 + .align ALIGNARG(4) +11: fstp %st(0) // pop y + fldl MO(one) + ret + + // y == ±inf + .align ALIGNARG(4) +12: fstp %st(0) // pop y + fldl MO(one) // 1 + flds 4(%esp) // x : 1 + fabs // abs(x) : 1 + fucompp // < 1, == 1, or > 1 + fnstsw + andb $0x45, %ah + cmpb $0x45, %ah + je 13f // jump if x is NaN + + cmpb $0x40, %ah + je 14f // jump if |x| == 1 + + shlb $1, %ah + xorb %ah, %dl + andl $2, %edx + fldl MOX(inf_zero, %edx, 4) + ret + + .align ALIGNARG(4) +14: fldl MO(one) + ret + + .align ALIGNARG(4) +13: flds 4(%esp) // load x == NaN + ret + + cfi_adjust_cfa_offset (4) + .align ALIGNARG(4) + // x is ±inf +15: fstp %st(0) // y + testb $2, %dh + jz 16f // jump if x == +inf + + // fistpl raises invalid exception for |y| >= 1L<<31, so test + // that (in which case y is certainly even) before testing + // whether y is odd. + fld %st // y : y + fabs // |y| : y + fcompl MO(p31) // y + fnstsw + sahf + jnc 16f + + // We must find out whether y is an odd integer. + fld %st // y : y + fistpl (%esp) // y + fildl (%esp) // int(y) : y + fucompp // <empty> + fnstsw + sahf + jne 17f + + // OK, the value is an integer. + popl %edx + cfi_adjust_cfa_offset (-4) + testb $1, %dl + jz 18f // jump if not odd + // It's an odd integer. 
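+	// int(y) is still in %edx; after the shift its sign bit indexes
+	// the minf_mzero pair: pow(-inf, y) is -inf for odd y > 0 and -0
+	// for odd y < 0.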
+ shrl $31, %edx + fldl MOX(minf_mzero, %edx, 8) + ret + + cfi_adjust_cfa_offset (4) + .align ALIGNARG(4) +16: fcompl MO(zero) + addl $4, %esp + cfi_adjust_cfa_offset (-4) + fnstsw + shrl $5, %eax + andl $8, %eax + fldl MOX(inf_zero, %eax, 1) + ret + + cfi_adjust_cfa_offset (4) + .align ALIGNARG(4) +17: shll $30, %edx // sign bit for y in right position + addl $4, %esp + cfi_adjust_cfa_offset (-4) +18: shrl $31, %edx + fldl MOX(inf_zero, %edx, 8) + ret + + cfi_adjust_cfa_offset (4) + .align ALIGNARG(4) + // x is ±0 +20: fstp %st(0) // y + testb $2, %dl + jz 21f // y > 0 + + // x is ±0 and y is < 0. We must find out whether y is an odd integer. + testb $2, %dh + jz 25f + + // fistpl raises invalid exception for |y| >= 1L<<31, so test + // that (in which case y is certainly even) before testing + // whether y is odd. + fld %st // y : y + fabs // |y| : y + fcompl MO(p31) // y + fnstsw + sahf + jnc 25f + + fld %st // y : y + fistpl (%esp) // y + fildl (%esp) // int(y) : y + fucompp // <empty> + fnstsw + sahf + jne 26f + + // OK, the value is an integer. + popl %edx + cfi_adjust_cfa_offset (-4) + testb $1, %dl + jz 27f // jump if not odd + // It's an odd integer. + // Raise divide-by-zero exception and get minus infinity value. + fldl MO(one) + fdivl MO(zero) + fchs + ret + + cfi_adjust_cfa_offset (4) +25: fstp %st(0) +26: addl $4, %esp + cfi_adjust_cfa_offset (-4) +27: // Raise divide-by-zero exception and get infinity value. + fldl MO(one) + fdivl MO(zero) + ret + + cfi_adjust_cfa_offset (4) + .align ALIGNARG(4) + // x is ±0 and y is > 0. We must find out whether y is an odd integer. +21: testb $2, %dh + jz 22f + + // fistpl raises invalid exception for |y| >= 1L<<31, so test + // that (in which case y is certainly even) before testing + // whether y is odd. + fcoml MO(p31) // y + fnstsw + sahf + jnc 22f + + fld %st // y : y + fistpl (%esp) // y + fildl (%esp) // int(y) : y + fucompp // <empty> + fnstsw + sahf + jne 23f + + // OK, the value is an integer. + popl %edx + cfi_adjust_cfa_offset (-4) + testb $1, %dl + jz 24f // jump if not odd + // It's an odd integer. + fldl MO(mzero) + ret + + cfi_adjust_cfa_offset (4) +22: fstp %st(0) +23: addl $4, %esp // Don't use pop. + cfi_adjust_cfa_offset (-4) +24: fldl MO(zero) + ret + +END(__ieee754_powf) +strong_alias (__ieee754_powf, __powf_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_powl.S b/REORG.TODO/sysdeps/i386/fpu/e_powl.S new file mode 100644 index 0000000000..9e162848e4 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_powl.S @@ -0,0 +1,459 @@ +/* ix87 specific implementation of pow function. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <machine/asm.h> +#include <i386-math-asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + .type p2,@object +p2: .byte 0, 0, 0, 0, 0, 0, 0x10, 0x40 + ASM_SIZE_DIRECTIVE(p2) + .type p63,@object +p63: .byte 0, 0, 0, 0, 0, 0, 0xe0, 0x43 + ASM_SIZE_DIRECTIVE(p63) + .type p64,@object +p64: .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43 + ASM_SIZE_DIRECTIVE(p64) + .type p78,@object +p78: .byte 0, 0, 0, 0, 0, 0, 0xd0, 0x44 + ASM_SIZE_DIRECTIVE(p78) + .type pm79,@object +pm79: .byte 0, 0, 0, 0, 0, 0, 0, 0x3b + ASM_SIZE_DIRECTIVE(pm79) + + .section .rodata.cst16,"aM",@progbits,16 + + .p2align 3 + .type infinity,@object +inf_zero: +infinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x7f + ASM_SIZE_DIRECTIVE(infinity) + .type zero,@object +zero: .double 0.0 + ASM_SIZE_DIRECTIVE(zero) + .type minf_mzero,@object +minf_mzero: +minfinity: + .byte 0, 0, 0, 0, 0, 0, 0xf0, 0xff +mzero: + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + ASM_SIZE_DIRECTIVE(minf_mzero) +DEFINE_LDBL_MIN + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) +#else +# define MO(op) op +# define MOX(op,x,f) op(,x,f) +#endif + + .text +ENTRY(__ieee754_powl) + fldt 16(%esp) // y + fxam + +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + + fnstsw + movb %ah, %dl + andb $0x45, %ah + cmpb $0x40, %ah // is y == 0 ? + je 11f + + cmpb $0x05, %ah // is y == ±inf ? + je 12f + + cmpb $0x01, %ah // is y == NaN ? + je 30f + + fldt 4(%esp) // x : y + + subl $8,%esp + cfi_adjust_cfa_offset (8) + + fxam + fnstsw + movb %ah, %dh + andb $0x45, %ah + cmpb $0x40, %ah + je 20f // x is ±0 + + cmpb $0x05, %ah + je 15f // x is ±inf + + cmpb $0x01, %ah + je 32f // x is NaN + + fxch // y : x + + /* fistpll raises invalid exception for |y| >= 1L<<63. */ + fld %st // y : y : x + fabs // |y| : y : x + fcompl MO(p63) // y : x + fnstsw + sahf + jnc 2f + + /* First see whether `y' is a natural number. In this case we + can use a more precise algorithm. */ + fld %st // y : y : x + fistpll (%esp) // y : x + fildll (%esp) // int(y) : y : x + fucomp %st(1) // y : x + fnstsw + sahf + je 9f + + // If y has absolute value at most 0x1p-79, then any finite + // nonzero x will result in 1. Saturate y to those bounds to + // avoid underflow in the calculation of y*log2(x). + fld %st // y : y : x + fabs // |y| : y : x + fcompl MO(pm79) // y : x + fnstsw + sahf + jnc 3f + fstp %st(0) // pop y + fldl MO(pm79) // 0x1p-79 : x + testb $2, %dl + jz 3f // y > 0 + fchs // -0x1p-79 : x + jmp 3f + +9: /* OK, we have an integer value for y. Unless very small + (we use < 4), use the algorithm for real exponent to avoid + accumulation of errors. */ + fld %st // y : y : x + fabs // |y| : y : x + fcompl MO(p2) // y : x + fnstsw + sahf + jnc 3f + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + orl $0, %edx + fstp %st(0) // x + jns 4f // y >= 0, jump + fdivrl MO(one) // 1/x (now referred to as x) + negl %eax + adcl $0, %edx + negl %edx +4: fldl MO(one) // 1 : x + fxch + + /* If y is even, take the absolute value of x. Otherwise, + ensure all intermediate values that might overflow have the + sign of x.
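+	   Keeping the sign on the intermediates means any overflow
+	   already saturates to an infinity of the correct sign, so no
+	   fix-up is needed afterwards.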
*/ + testb $1, %al + jnz 6f + fabs + +6: shrdl $1, %edx, %eax + jnc 5f + fxch + fabs + fmul %st(1) // x : ST*x + fxch +5: fld %st // x : x : ST*x + fabs // |x| : x : ST*x + fmulp // |x|*x : ST*x + shrl $1, %edx + movl %eax, %ecx + orl %edx, %ecx + jnz 6b + fstp %st(0) // ST*x +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + LDBL_CHECK_FORCE_UFLOW_NONNAN + ret + + /* y is ±NAN */ +30: fldt 4(%esp) // x : y + fldl MO(one) // 1.0 : x : y + fucomp %st(1) // x : y + fnstsw + sahf + je 33f +31: /* At least one argument NaN, and result should be NaN. */ + faddp + ret +33: jp 31b + /* pow (1, NaN); check whether the NaN is signaling. */ + testb $0x40, 23(%esp) + jz 31b + fstp %st(1) + ret + + cfi_adjust_cfa_offset (8) +32: addl $8, %esp + cfi_adjust_cfa_offset (-8) + faddp + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) +2: // y is a large integer (absolute value at least 1L<<63). + // If y has absolute value at least 1L<<78, then any finite + // nonzero x will result in 0 (underflow), 1 or infinity (overflow). + // Saturate y to those bounds to avoid overflow in the calculation + // of y*log2(x). + fld %st // y : y : x + fabs // |y| : y : x + fcompl MO(p78) // y : x + fnstsw + sahf + jc 3f + fstp %st(0) // pop y + fldl MO(p78) // 1L<<78 : x + testb $2, %dl + jz 3f // y > 0 + fchs // -(1L<<78) : x + .align ALIGNARG(4) +3: /* y is a real number. */ + subl $28, %esp + cfi_adjust_cfa_offset (28) + fstpt 12(%esp) // x + fstpt (%esp) // <empty> + call HIDDEN_JUMPTARGET (__powl_helper) // <result> + addl $36, %esp + cfi_adjust_cfa_offset (-36) + ret + + // pow(x,±0) = 1, unless x is sNaN + .align ALIGNARG(4) +11: fstp %st(0) // pop y + fldt 4(%esp) // x + fxam + fnstsw + andb $0x45, %ah + cmpb $0x01, %ah + je 112f // x is NaN +111: fstp %st(0) + fldl MO(one) + ret + +112: testb $0x40, 11(%esp) + jnz 111b + fadd %st(0) + ret + + // y == ±inf + .align ALIGNARG(4) +12: fstp %st(0) // pop y + fldl MO(one) // 1 + fldt 4(%esp) // x : 1 + fabs // abs(x) : 1 + fucompp // < 1, == 1, or > 1 + fnstsw + andb $0x45, %ah + cmpb $0x45, %ah + je 13f // jump if x is NaN + + cmpb $0x40, %ah + je 14f // jump if |x| == 1 + + shlb $1, %ah + xorb %ah, %dl + andl $2, %edx + fldl MOX(inf_zero, %edx, 4) + ret + + .align ALIGNARG(4) +14: fldl MO(one) + ret + + .align ALIGNARG(4) +13: fldt 4(%esp) // load x == NaN + fadd %st(0) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) + // x is ±inf +15: fstp %st(0) // y + testb $2, %dh + jz 16f // jump if x == +inf + + // fistpll raises invalid exception for |y| >= 1L<<63, but y + // may be odd unless we know |y| >= 1L<<64. + fld %st // y : y + fabs // |y| : y + fcompl MO(p64) // y + fnstsw + sahf + jnc 16f + fldl MO(p63) // p63 : y + fxch // y : p63 + fprem // y%p63 : p63 + fstp %st(1) // y%p63 + + // We must find out whether y is an odd integer. + fld %st // y : y + fistpll (%esp) // y + fildll (%esp) // int(y) : y + fucompp // <empty> + fnstsw + sahf + jne 17f + + // OK, the value is an integer, but is it odd? + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + andb $1, %al + jz 18f // jump if not odd + // It's an odd integer.
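+	// The fprem above subtracted only an even multiple of 2^63, so
+	// the parity just tested is y's own, and the high word in %edx
+	// still has y's sign for the minf_mzero lookup below.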
+ shrl $31, %edx + fldl MOX(minf_mzero, %edx, 8) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) +16: fcompl MO(zero) + addl $8, %esp + cfi_adjust_cfa_offset (-8) + fnstsw + shrl $5, %eax + andl $8, %eax + fldl MOX(inf_zero, %eax, 1) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) +17: shll $30, %edx // sign bit for y in right position + addl $8, %esp + cfi_adjust_cfa_offset (-8) +18: shrl $31, %edx + fldl MOX(inf_zero, %edx, 8) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) + // x is ±0 +20: fstp %st(0) // y + testb $2, %dl + jz 21f // y > 0 + + // x is ±0 and y is < 0. We must find out whether y is an odd integer. + testb $2, %dh + jz 25f + + // fistpll raises invalid exception for |y| >= 1L<<63, but y + // may be odd unless we know |y| >= 1L<<64. + fld %st // y : y + fabs // |y| : y + fcompl MO(p64) // y + fnstsw + sahf + jnc 25f + fldl MO(p63) // p63 : y + fxch // y : p63 + fprem // y%p63 : p63 + fstp %st(1) // y%p63 + + fld %st // y : y + fistpll (%esp) // y + fildll (%esp) // int(y) : y + fucompp // <empty> + fnstsw + sahf + jne 26f + + // OK, the value is an integer, but is it odd? + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + andb $1, %al + jz 27f // jump if not odd + // It's an odd integer. + // Raise divide-by-zero exception and get minus infinity value. + fldl MO(one) + fdivl MO(zero) + fchs + ret + + cfi_adjust_cfa_offset (8) +25: fstp %st(0) +26: addl $8, %esp + cfi_adjust_cfa_offset (-8) +27: // Raise divide-by-zero exception and get infinity value. + fldl MO(one) + fdivl MO(zero) + ret + + cfi_adjust_cfa_offset (8) + .align ALIGNARG(4) + // x is ±0 and y is > 0. We must find out whether y is an odd integer. +21: testb $2, %dh + jz 22f + + // fistpll raises invalid exception for |y| >= 1L<<63, but y + // may be odd unless we know |y| >= 1L<<64. + fld %st // y : y + fcompl MO(p64) // y + fnstsw + sahf + jnc 22f + fldl MO(p63) // p63 : y + fxch // y : p63 + fprem // y%p63 : p63 + fstp %st(1) // y%p63 + + fld %st // y : y + fistpll (%esp) // y + fildll (%esp) // int(y) : y + fucompp // <empty> + fnstsw + sahf + jne 23f + + // OK, the value is an integer, but is it odd? + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + andb $1, %al + jz 24f // jump if not odd + // It's an odd integer. + fldl MO(mzero) + ret + + cfi_adjust_cfa_offset (8) +22: fstp %st(0) +23: addl $8, %esp // Don't use 2 x pop + cfi_adjust_cfa_offset (-8) +24: fldl MO(zero) + ret + +END(__ieee754_powl) +strong_alias (__ieee754_powl, __powl_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c b/REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c new file mode 100644 index 0000000000..1347b0468c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_rem_pio2.c @@ -0,0 +1,3 @@ +/* Empty. This file is only meant to avoid compiling the file with the + same name in the libm-ieee754 directory. The code is not used since + there is an assembler version for all users of this file. */ diff --git a/REORG.TODO/sysdeps/i386/fpu/e_remainder.S b/REORG.TODO/sysdeps/i386/fpu/e_remainder.S new file mode 100644 index 0000000000..f7867aa90b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_remainder.S @@ -0,0 +1,18 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. 
+ */ + +#include <machine/asm.h> + +ENTRY(__ieee754_remainder) + fldl 12(%esp) + fldl 4(%esp) +1: fprem1 + fstsw %ax + sahf + jp 1b + fstp %st(1) + ret +END (__ieee754_remainder) +strong_alias (__ieee754_remainder, __remainder_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_remainderf.S b/REORG.TODO/sysdeps/i386/fpu/e_remainderf.S new file mode 100644 index 0000000000..cfd390bc69 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_remainderf.S @@ -0,0 +1,18 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +ENTRY(__ieee754_remainderf) + flds 8(%esp) + flds 4(%esp) +1: fprem1 + fstsw %ax + sahf + jp 1b + fstp %st(1) + ret +END (__ieee754_remainderf) +strong_alias (__ieee754_remainderf, __remainderf_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_remainderl.S b/REORG.TODO/sysdeps/i386/fpu/e_remainderl.S new file mode 100644 index 0000000000..5ec23a37a3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_remainderl.S @@ -0,0 +1,20 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + */ + +#include <machine/asm.h> + +ENTRY(__ieee754_remainderl) + fldt 16(%esp) + fldt 4(%esp) +1: fprem1 + fstsw %ax + sahf + jp 1b + fstp %st(1) + ret +END (__ieee754_remainderl) +strong_alias (__ieee754_remainderl, __remainderl_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_scalb.S b/REORG.TODO/sysdeps/i386/fpu/e_scalb.S new file mode 100644 index 0000000000..370924c29f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_scalb.S @@ -0,0 +1,100 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Correct handling of y==-inf <drepper@gnu> + */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type zero_nan,@object +zero_nan: + .double 0.0 +nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f + ASM_SIZE_DIRECTIVE(zero_nan) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) +#else +# define MO(op) op +# define MOX(op,x,f) op(,x,f) +#endif + + .text +ENTRY(__ieee754_scalb) + fldl 12(%esp) + fxam + fnstsw + fldl 4(%esp) + andl $0x4700, %eax + cmpl $0x0700, %eax + je 1f + andl $0x4500, %eax + cmpl $0x0100, %eax + je 2f + fxam + fnstsw + andl $0x4500, %eax + cmpl $0x0100, %eax + je 3f + fld %st(1) + frndint + fcomp %st(2) + fnstsw + sahf + jne 4f + fscale + fstp %st(1) + DBL_NARROW_EVAL + ret + + /* y is -inf */ +1: fxam +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + fnstsw + movl 8(%esp), %edx + shrl $5, %eax + fstp %st + fstp %st + andl $0x80000000, %edx + andl $0x0228, %eax + cmpl $0x0028, %eax + je 4f + andl $8, %eax + shrl $27, %edx + addl %edx, %eax + fldl MOX(zero_nan, %eax, 1) + ret + + /* The result is NaN, but we must not raise an exception. + So use a variable. */ +2: fstp %st + fstp %st +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + fldl MO(nan) + ret + + /* The first parameter is a NaN. Return it. */ +3: fstp %st(1) + ret + + /* Return NaN and raise the invalid exception. */ +4: fstp %st + fstp %st + fldz + fdiv %st + ret +END(__ieee754_scalb) +strong_alias (__ieee754_scalb, __scalb_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_scalbf.S b/REORG.TODO/sysdeps/i386/fpu/e_scalbf.S new file mode 100644 index 0000000000..4f2dfa3acf --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_scalbf.S @@ -0,0 +1,102 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. 
+ * Adapted for float type by Ulrich Drepper <drepper@cygnus.com>. + * + * Correct handling of y==-inf <drepper@gnu> + */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type zero_nan,@object +zero_nan: + .double 0.0 +nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f + ASM_SIZE_DIRECTIVE(zero_nan) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) +#else +# define MO(op) op +# define MOX(op,x,f) op(,x,f) +#endif + + + .text +ENTRY(__ieee754_scalbf) + flds 8(%esp) + fxam + fnstsw + flds 4(%esp) + andl $0x4700, %eax + cmpl $0x0700, %eax + je 1f + andl $0x4500, %eax + cmpl $0x0100, %eax + je 2f + fxam + fnstsw + andl $0x4500, %eax + cmpl $0x0100, %eax + je 3f + fld %st(1) + frndint + fcomp %st(2) + fnstsw + sahf + jne 4f + fscale + fstp %st(1) + FLT_NARROW_EVAL + ret + + /* y is -inf */ +1: fxam +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + fnstsw + movl 4(%esp), %edx + shrl $5, %eax + fstp %st + fstp %st + andl $0x80000000, %edx + andl $0x0228, %eax + cmpl $0x0028, %eax + je 4f + andl $8, %eax + shrl $27, %edx + addl %edx, %eax + fldl MOX(zero_nan, %eax, 1) + ret + + /* The result is NaN, but we must not raise an exception. + So use a variable. */ +2: fstp %st + fstp %st +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + fldl MO(nan) + ret + + /* The first parameter is a NaN. Return it. */ +3: fstp %st(1) + ret + + /* Return NaN and raise the invalid exception. */ +4: fstp %st + fstp %st + fldz + fdiv %st + ret +END(__ieee754_scalbf) +strong_alias (__ieee754_scalbf, __scalbf_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_scalbl.S b/REORG.TODO/sysdeps/i386/fpu/e_scalbl.S new file mode 100644 index 0000000000..896f599cb0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_scalbl.S @@ -0,0 +1,90 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + * + * Correct handling of y==-inf <drepper@gnu> + */ + +#include <machine/asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type zero_nan,@object +zero_nan: + .double 0.0 +nan: .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f + .byte 0, 0, 0, 0, 0, 0, 0, 0x80 + .byte 0, 0, 0, 0, 0, 0, 0xff, 0x7f + ASM_SIZE_DIRECTIVE(zero_nan) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +# define MOX(op,x,f) op##@GOTOFF(%ecx,x,f) +#else +# define MO(op) op +# define MOX(op,x,f) op(,x,f) +#endif + + .text +ENTRY(__ieee754_scalbl) + fldt 16(%esp) + fxam + fnstsw + fldt 4(%esp) + andl $0x4700, %eax + cmpl $0x0700, %eax + je 1f + andl $0x4500, %eax + cmpl $0x0100, %eax + je 2f + fxam + fnstsw + andl $0x4500, %eax + cmpl $0x0100, %eax + je 2f + fld %st(1) + frndint + fcomp %st(2) + fnstsw + sahf + jne 4f + fscale + fstp %st(1) + ret + + /* y is -inf */ +1: fxam +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + fnstsw + movl 12(%esp), %edx + shrl $5, %eax + fstp %st + fstp %st + andl $0x8000, %edx + andl $0x0228, %eax + cmpl $0x0028, %eax + je 4f + andl $8, %eax + shrl $11, %edx + addl %edx, %eax + fldl MOX(zero_nan, %eax, 1) + ret + + /* The result is NaN; raise an exception for sNaN arguments. */ +2: faddp + ret + + /* Return NaN and raise the invalid exception. 
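+	   (The 0.0/0.0 division below is what raises FE_INVALID and
+	   yields the default quiet NaN.)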
*/ +4: fstp %st + fstp %st + fldz + fdiv %st + ret +END(__ieee754_scalbl) +strong_alias (__ieee754_scalbl, __scalbl_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_sqrt.S b/REORG.TODO/sysdeps/i386/fpu/e_sqrt.S new file mode 100644 index 0000000000..fba5833a9a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_sqrt.S @@ -0,0 +1,23 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +ENTRY(__ieee754_sqrt) + fldl 4(%esp) + subl $8, %esp + cfi_adjust_cfa_offset (8) + fstcw 4(%esp) + movl $0xfeff, %edx + andl 4(%esp), %edx + movl %edx, (%esp) + fldcw (%esp) + fsqrt + fldcw 4(%esp) + addl $8, %esp + cfi_adjust_cfa_offset (-8) + ret +END (__ieee754_sqrt) +strong_alias (__ieee754_sqrt, __sqrt_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S b/REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S new file mode 100644 index 0000000000..6f7e4b015f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_sqrtf.S @@ -0,0 +1,13 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +ENTRY(__ieee754_sqrtf) + flds 4(%esp) + fsqrt + ret +END (__ieee754_sqrtf) +strong_alias (__ieee754_sqrtf, __sqrtf_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c b/REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c new file mode 100644 index 0000000000..41bcd7eeb7 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/e_sqrtl.c @@ -0,0 +1,20 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + */ + +#include <math_private.h> + +#undef __ieee754_sqrtl +long double +__ieee754_sqrtl (long double x) +{ + long double res; + + asm ("fsqrt" : "=t" (res) : "0" (x)); + + return res; +} +strong_alias (__ieee754_sqrtl, __sqrtl_finite) diff --git a/REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c b/REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c new file mode 100644 index 0000000000..5d8596964b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/fclrexcpt.c @@ -0,0 +1,69 @@ +/* Clear given exceptions in current floating-point environment. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <unistd.h> +#include <ldsodefs.h> +#include <dl-procinfo.h> + +int +__feclearexcept (int excepts) +{ + fenv_t temp; + + /* Mask out unsupported bits/exceptions. */ + excepts &= FE_ALL_EXCEPT; + + /* Bah, we have to clear selected exceptions. Since there is no + `fldsw' instruction we have to do it the hard way. */ + __asm__ ("fnstenv %0" : "=m" (*&temp)); + + /* Clear the relevant bits. */ + temp.__status_word &= excepts ^ FE_ALL_EXCEPT; + + /* Put the new data in effect. */ + __asm__ ("fldenv %0" : : "m" (*&temp)); + + /* If the CPU supports SSE, we clear the MXCSR as well. 
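+     The FE_* exception bits occupy the same positions in the MXCSR as
+     in the 387 status word, so EXCEPTS can be masked out of the MXCSR
+     directly.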
*/ + if (HAS_CPU_FEATURE (SSE)) + { + unsigned int xnew_exc; + + /* Get the current MXCSR. */ + __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc)); + + /* Clear the relevant bits. */ + xnew_exc &= ~excepts; + + /* Put the new data in effect. */ + __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc)); + } + + /* Success. */ + return 0; +} + +#include <shlib-compat.h> +#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2) +strong_alias (__feclearexcept, __old_feclearexcept) +compat_symbol (libm, __old_feclearexcept, feclearexcept, GLIBC_2_1); +#endif + +libm_hidden_ver (__feclearexcept, feclearexcept) +versioned_symbol (libm, __feclearexcept, feclearexcept, GLIBC_2_2); diff --git a/REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c b/REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c new file mode 100644 index 0000000000..f8db665425 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/fedisblxcpt.c @@ -0,0 +1,54 @@ +/* Disable floating-point exceptions. + Copyright (C) 1999-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 1999. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <unistd.h> +#include <ldsodefs.h> +#include <dl-procinfo.h> + +int +fedisableexcept (int excepts) +{ + unsigned short int new_exc, old_exc; + + /* Get the current control word. */ + __asm__ ("fstcw %0" : "=m" (*&new_exc)); + + old_exc = (~new_exc) & FE_ALL_EXCEPT; + + excepts &= FE_ALL_EXCEPT; + + new_exc |= excepts; + __asm__ ("fldcw %0" : : "m" (*&new_exc)); + + /* If the CPU supports SSE we set the MXCSR as well. */ + if (HAS_CPU_FEATURE (SSE)) + { + unsigned int xnew_exc; + + /* Get the current control word. */ + __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc)); + + xnew_exc |= excepts << 7; + + __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc)); + } + + return old_exc; +} diff --git a/REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c b/REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c new file mode 100644 index 0000000000..f1c42d7c27 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/feenablxcpt.c @@ -0,0 +1,54 @@ +/* Enable floating-point exceptions. + Copyright (C) 1999-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 1999. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <fenv.h> +#include <unistd.h> +#include <ldsodefs.h> +#include <dl-procinfo.h> + +int +feenableexcept (int excepts) +{ + unsigned short int new_exc; + unsigned short int old_exc; + + /* Get the current control word. */ + __asm__ ("fstcw %0" : "=m" (*&new_exc)); + + excepts &= FE_ALL_EXCEPT; + old_exc = (~new_exc) & FE_ALL_EXCEPT; + + new_exc &= ~excepts; + __asm__ ("fldcw %0" : : "m" (*&new_exc)); + + /* If the CPU supports SSE we set the MXCSR as well. */ + if (HAS_CPU_FEATURE (SSE)) + { + unsigned int xnew_exc; + + /* Get the current control word. */ + __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc)); + + xnew_exc &= ~(excepts << 7); + + __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc)); + } + + return old_exc; +} diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetenv.c b/REORG.TODO/sysdeps/i386/fpu/fegetenv.c new file mode 100644 index 0000000000..983f6af25e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/fegetenv.c @@ -0,0 +1,49 @@ +/* Store current floating-point environment. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <unistd.h> +#include <ldsodefs.h> +#include <dl-procinfo.h> + +int +__fegetenv (fenv_t *envp) +{ + __asm__ ("fnstenv %0" : "=m" (*envp)); + /* And load it right back since the processor changes the mask. + Intel thought this opcode to be used in interrupt handlers which + would block all exceptions. */ + __asm__ ("fldenv %0" : : "m" (*envp)); + + if (HAS_CPU_FEATURE (SSE)) + __asm__ ("stmxcsr %0" : "=m" (envp->__eip)); + + /* Success. */ + return 0; +} + +#include <shlib-compat.h> +#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2) +strong_alias (__fegetenv, __old_fegetenv) +compat_symbol (libm, __old_fegetenv, fegetenv, GLIBC_2_1); +#endif + +libm_hidden_def (__fegetenv) +libm_hidden_ver (__fegetenv, fegetenv) +versioned_symbol (libm, __fegetenv, fegetenv, GLIBC_2_2); diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetexcept.c b/REORG.TODO/sysdeps/i386/fpu/fegetexcept.c new file mode 100644 index 0000000000..dc87b7a470 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/fegetexcept.c @@ -0,0 +1,31 @@ +/* Get enabled floating-point exceptions. + Copyright (C) 1999-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Andreas Jaeger <aj@suse.de>, 1999. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +fegetexcept (void) +{ + unsigned short int exc; + + /* Get the current control word. */ + __asm__ ("fstcw %0" : "=m" (*&exc)); + + return (~exc) & FE_ALL_EXCEPT; +} diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetmode.c b/REORG.TODO/sysdeps/i386/fpu/fegetmode.c new file mode 100644 index 0000000000..abbce3075f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/fegetmode.c @@ -0,0 +1,32 @@ +/* Store current floating-point control modes. i386 version. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> +#include <unistd.h> +#include <ldsodefs.h> +#include <dl-procinfo.h> + +int +fegetmode (femode_t *modep) +{ + _FPU_GETCW (modep->__control_word); + if (HAS_CPU_FEATURE (SSE)) + __asm__ ("stmxcsr %0" : "=m" (modep->__mxcsr)); + return 0; +} diff --git a/REORG.TODO/sysdeps/i386/fpu/fegetround.c b/REORG.TODO/sysdeps/i386/fpu/fegetround.c new file mode 100644 index 0000000000..8ce8b859d8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/fegetround.c @@ -0,0 +1,33 @@ +/* Return current rounding direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +__fegetround (void) +{ + int cw; + + __asm__ ("fnstcw %0" : "=m" (*&cw)); + + return cw & 0xc00; +} +libm_hidden_def (__fegetround) +weak_alias (__fegetround, fegetround) +libm_hidden_weak (fegetround) diff --git a/REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c b/REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c new file mode 100644 index 0000000000..d327358913 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/feholdexcpt.c @@ -0,0 +1,50 @@ +/* Store current floating-point environment and clear exceptions. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <unistd.h> +#include <ldsodefs.h> +#include <dl-procinfo.h> + +int +__feholdexcept (fenv_t *envp) +{ + /* Store the environment. Recall that fnstenv has a side effect of + masking all exceptions. Then clear all exceptions. */ + __asm__ volatile ("fnstenv %0; fnclex" : "=m" (*envp)); + + /* If the CPU supports SSE we set the MXCSR as well. */ + if (HAS_CPU_FEATURE (SSE)) + { + unsigned int xwork; + + /* Get the current control word. */ + __asm__ ("stmxcsr %0" : "=m" (envp->__eip)); + + /* Set all exceptions to non-stop and clear them. */ + xwork = (envp->__eip | 0x1f80) & ~0x3f; + + __asm__ ("ldmxcsr %0" : : "m" (*&xwork)); + } + + return 0; +} +libm_hidden_def (__feholdexcept) +weak_alias (__feholdexcept, feholdexcept) +libm_hidden_weak (feholdexcept) diff --git a/REORG.TODO/sysdeps/i386/fpu/fenv_private.h b/REORG.TODO/sysdeps/i386/fpu/fenv_private.h new file mode 100644 index 0000000000..e20e1f1662 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/fenv_private.h @@ -0,0 +1,501 @@ +#ifndef FENV_PRIVATE_H +#define FENV_PRIVATE_H 1 + +#include <fenv.h> +#include <fpu_control.h> + +#ifdef __SSE2_MATH__ +# define math_opt_barrier(x) \ + ({ __typeof(x) __x; \ + if (sizeof (x) <= sizeof (double)) \ + __asm ("" : "=x" (__x) : "0" (x)); \ + else \ + __asm ("" : "=t" (__x) : "0" (x)); \ + __x; }) +# define math_force_eval(x) \ + do { \ + if (sizeof (x) <= sizeof (double)) \ + __asm __volatile ("" : : "x" (x)); \ + else \ + __asm __volatile ("" : : "f" (x)); \ + } while (0) +#else +# define math_opt_barrier(x) \ + ({ __typeof (x) __x; \ + __asm ("" : "=t" (__x) : "0" (x)); \ + __x; }) +# define math_force_eval(x) \ + do { \ + __typeof (x) __x = (x); \ + if (sizeof (x) <= sizeof (double)) \ + __asm __volatile ("" : : "m" (__x)); \ + else \ + __asm __volatile ("" : : "f" (__x)); \ + } while (0) +#endif + +/* This file is used by both the 32- and 64-bit ports. The 64-bit port + has a field in the fenv_t for the mxcsr; the 32-bit port does not. + Instead, we (ab)use the only 32-bit field extant in the struct. */ +#ifndef __x86_64__ +# define __mxcsr __eip +#endif + + +/* All of these functions are private to libm, and are all used in pairs + to save+change the fp state and restore the original state. Thus we + need not care for both the 387 and the sse unit, only the one we're + actually using. 
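+   Which unit that is gets decided at compile time: the __SSE_MATH__
+   and __SSE2_MATH__ blocks below route the float and double helpers
+   through SSE, while the long double helpers always use the 387.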
*/ + +#if defined __AVX__ || defined SSE2AVX +# define STMXCSR "vstmxcsr" +# define LDMXCSR "vldmxcsr" +#else +# define STMXCSR "stmxcsr" +# define LDMXCSR "ldmxcsr" +#endif + +static __always_inline void +libc_feholdexcept_sse (fenv_t *e) +{ + unsigned int mxcsr; + asm (STMXCSR " %0" : "=m" (*&mxcsr)); + e->__mxcsr = mxcsr; + mxcsr = (mxcsr | 0x1f80) & ~0x3f; + asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr)); +} + +static __always_inline void +libc_feholdexcept_387 (fenv_t *e) +{ + /* Recall that fnstenv has a side-effect of masking exceptions. + Clobber all of the fp registers so that the TOS field is 0. */ + asm volatile ("fnstenv %0; fnclex" + : "=m"(*e) + : : "st", "st(1)", "st(2)", "st(3)", + "st(4)", "st(5)", "st(6)", "st(7)"); +} + +static __always_inline void +libc_fesetround_sse (int r) +{ + unsigned int mxcsr; + asm (STMXCSR " %0" : "=m" (*&mxcsr)); + mxcsr = (mxcsr & ~0x6000) | (r << 3); + asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr)); +} + +static __always_inline void +libc_fesetround_387 (int r) +{ + fpu_control_t cw; + _FPU_GETCW (cw); + cw = (cw & ~0xc00) | r; + _FPU_SETCW (cw); +} + +static __always_inline void +libc_feholdexcept_setround_sse (fenv_t *e, int r) +{ + unsigned int mxcsr; + asm (STMXCSR " %0" : "=m" (*&mxcsr)); + e->__mxcsr = mxcsr; + mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3); + asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr)); +} + +/* Set both rounding mode and precision. A convenience function for use + by libc_feholdexcept_setround and libc_feholdexcept_setround_53bit. */ +static __always_inline void +libc_feholdexcept_setround_387_prec (fenv_t *e, int r) +{ + libc_feholdexcept_387 (e); + + fpu_control_t cw = e->__control_word; + cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED); + cw |= r | 0x3f; + _FPU_SETCW (cw); +} + +static __always_inline void +libc_feholdexcept_setround_387 (fenv_t *e, int r) +{ + libc_feholdexcept_setround_387_prec (e, r | _FPU_EXTENDED); +} + +static __always_inline void +libc_feholdexcept_setround_387_53bit (fenv_t *e, int r) +{ + libc_feholdexcept_setround_387_prec (e, r | _FPU_DOUBLE); +} + +static __always_inline int +libc_fetestexcept_sse (int e) +{ + unsigned int mxcsr; + asm volatile (STMXCSR " %0" : "=m" (*&mxcsr)); + return mxcsr & e & FE_ALL_EXCEPT; +} + +static __always_inline int +libc_fetestexcept_387 (int ex) +{ + fexcept_t temp; + asm volatile ("fnstsw %0" : "=a" (temp)); + return temp & ex & FE_ALL_EXCEPT; +} + +static __always_inline void +libc_fesetenv_sse (fenv_t *e) +{ + asm volatile (LDMXCSR " %0" : : "m" (e->__mxcsr)); +} + +static __always_inline void +libc_fesetenv_387 (fenv_t *e) +{ + /* Clobber all fp registers so that the TOS value we saved earlier is + compatible with the current state of the compiler. */ + asm volatile ("fldenv %0" + : : "m" (*e) + : "st", "st(1)", "st(2)", "st(3)", + "st(4)", "st(5)", "st(6)", "st(7)"); +} + +static __always_inline int +libc_feupdateenv_test_sse (fenv_t *e, int ex) +{ + unsigned int mxcsr, old_mxcsr, cur_ex; + asm volatile (STMXCSR " %0" : "=m" (*&mxcsr)); + cur_ex = mxcsr & FE_ALL_EXCEPT; + + /* Merge current exceptions with the old environment. */ + old_mxcsr = e->__mxcsr; + mxcsr = old_mxcsr | cur_ex; + asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr)); + + /* Raise SIGFPE for any new exceptions since the hold. Expect that + the normal environment has all exceptions masked. */ + if (__glibc_unlikely (~(old_mxcsr >> 7) & cur_ex)) + __feraiseexcept (cur_ex); + + /* Test for exceptions raised since the hold. 
*/ + return cur_ex & ex; +} + +static __always_inline int +libc_feupdateenv_test_387 (fenv_t *e, int ex) +{ + fexcept_t cur_ex; + + /* Save current exceptions. */ + asm volatile ("fnstsw %0" : "=a" (cur_ex)); + cur_ex &= FE_ALL_EXCEPT; + + /* Reload original environment. */ + libc_fesetenv_387 (e); + + /* Merge current exceptions. */ + __feraiseexcept (cur_ex); + + /* Test for exceptions raised since the hold. */ + return cur_ex & ex; +} + +static __always_inline void +libc_feupdateenv_sse (fenv_t *e) +{ + libc_feupdateenv_test_sse (e, 0); +} + +static __always_inline void +libc_feupdateenv_387 (fenv_t *e) +{ + libc_feupdateenv_test_387 (e, 0); +} + +static __always_inline void +libc_feholdsetround_sse (fenv_t *e, int r) +{ + unsigned int mxcsr; + asm (STMXCSR " %0" : "=m" (*&mxcsr)); + e->__mxcsr = mxcsr; + mxcsr = (mxcsr & ~0x6000) | (r << 3); + asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr)); +} + +static __always_inline void +libc_feholdsetround_387_prec (fenv_t *e, int r) +{ + fpu_control_t cw; + + _FPU_GETCW (cw); + e->__control_word = cw; + cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED); + cw |= r; + _FPU_SETCW (cw); +} + +static __always_inline void +libc_feholdsetround_387 (fenv_t *e, int r) +{ + libc_feholdsetround_387_prec (e, r | _FPU_EXTENDED); +} + +static __always_inline void +libc_feholdsetround_387_53bit (fenv_t *e, int r) +{ + libc_feholdsetround_387_prec (e, r | _FPU_DOUBLE); +} + +static __always_inline void +libc_feresetround_sse (fenv_t *e) +{ + unsigned int mxcsr; + asm (STMXCSR " %0" : "=m" (*&mxcsr)); + mxcsr = (mxcsr & ~0x6000) | (e->__mxcsr & 0x6000); + asm volatile (LDMXCSR " %0" : : "m" (*&mxcsr)); +} + +static __always_inline void +libc_feresetround_387 (fenv_t *e) +{ + _FPU_SETCW (e->__control_word); +} + +#ifdef __SSE_MATH__ +# define libc_feholdexceptf libc_feholdexcept_sse +# define libc_fesetroundf libc_fesetround_sse +# define libc_feholdexcept_setroundf libc_feholdexcept_setround_sse +# define libc_fetestexceptf libc_fetestexcept_sse +# define libc_fesetenvf libc_fesetenv_sse +# define libc_feupdateenv_testf libc_feupdateenv_test_sse +# define libc_feupdateenvf libc_feupdateenv_sse +# define libc_feholdsetroundf libc_feholdsetround_sse +# define libc_feresetroundf libc_feresetround_sse +#else +# define libc_feholdexceptf libc_feholdexcept_387 +# define libc_fesetroundf libc_fesetround_387 +# define libc_feholdexcept_setroundf libc_feholdexcept_setround_387 +# define libc_fetestexceptf libc_fetestexcept_387 +# define libc_fesetenvf libc_fesetenv_387 +# define libc_feupdateenv_testf libc_feupdateenv_test_387 +# define libc_feupdateenvf libc_feupdateenv_387 +# define libc_feholdsetroundf libc_feholdsetround_387 +# define libc_feresetroundf libc_feresetround_387 +#endif /* __SSE_MATH__ */ + +#ifdef __SSE2_MATH__ +# define libc_feholdexcept libc_feholdexcept_sse +# define libc_fesetround libc_fesetround_sse +# define libc_feholdexcept_setround libc_feholdexcept_setround_sse +# define libc_fetestexcept libc_fetestexcept_sse +# define libc_fesetenv libc_fesetenv_sse +# define libc_feupdateenv_test libc_feupdateenv_test_sse +# define libc_feupdateenv libc_feupdateenv_sse +# define libc_feholdsetround libc_feholdsetround_sse +# define libc_feresetround libc_feresetround_sse +#else +# define libc_feholdexcept libc_feholdexcept_387 +# define libc_fesetround libc_fesetround_387 +# define libc_feholdexcept_setround libc_feholdexcept_setround_387 +# define libc_fetestexcept libc_fetestexcept_387 +# define libc_fesetenv libc_fesetenv_387 +# define libc_feupdateenv_test 
libc_feupdateenv_test_387 +# define libc_feupdateenv libc_feupdateenv_387 +# define libc_feholdsetround libc_feholdsetround_387 +# define libc_feresetround libc_feresetround_387 +#endif /* __SSE2_MATH__ */ + +#define libc_feholdexceptl libc_feholdexcept_387 +#define libc_fesetroundl libc_fesetround_387 +#define libc_feholdexcept_setroundl libc_feholdexcept_setround_387 +#define libc_fetestexceptl libc_fetestexcept_387 +#define libc_fesetenvl libc_fesetenv_387 +#define libc_feupdateenv_testl libc_feupdateenv_test_387 +#define libc_feupdateenvl libc_feupdateenv_387 +#define libc_feholdsetroundl libc_feholdsetround_387 +#define libc_feresetroundl libc_feresetround_387 + +#ifndef __SSE2_MATH__ +# define libc_feholdexcept_setround_53bit libc_feholdexcept_setround_387_53bit +# define libc_feholdsetround_53bit libc_feholdsetround_387_53bit +#endif + +/* We have support for rounding mode context. */ +#define HAVE_RM_CTX 1 + +static __always_inline void +libc_feholdexcept_setround_sse_ctx (struct rm_ctx *ctx, int r) +{ + unsigned int mxcsr, new_mxcsr; + asm (STMXCSR " %0" : "=m" (*&mxcsr)); + new_mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3); + + ctx->env.__mxcsr = mxcsr; + if (__glibc_unlikely (mxcsr != new_mxcsr)) + { + asm volatile (LDMXCSR " %0" : : "m" (*&new_mxcsr)); + ctx->updated_status = true; + } + else + ctx->updated_status = false; +} + +/* Unconditional since we want to overwrite any exceptions that occurred in the + context. This is also why all fehold* functions unconditionally write into + ctx->env. */ +static __always_inline void +libc_fesetenv_sse_ctx (struct rm_ctx *ctx) +{ + libc_fesetenv_sse (&ctx->env); +} + +static __always_inline void +libc_feupdateenv_sse_ctx (struct rm_ctx *ctx) +{ + if (__glibc_unlikely (ctx->updated_status)) + libc_feupdateenv_test_sse (&ctx->env, 0); +} + +static __always_inline void +libc_feholdexcept_setround_387_prec_ctx (struct rm_ctx *ctx, int r) +{ + libc_feholdexcept_387 (&ctx->env); + + fpu_control_t cw = ctx->env.__control_word; + fpu_control_t old_cw = cw; + cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED); + cw |= r | 0x3f; + + if (__glibc_unlikely (old_cw != cw)) + { + _FPU_SETCW (cw); + ctx->updated_status = true; + } + else + ctx->updated_status = false; +} + +static __always_inline void +libc_feholdexcept_setround_387_ctx (struct rm_ctx *ctx, int r) +{ + libc_feholdexcept_setround_387_prec_ctx (ctx, r | _FPU_EXTENDED); +} + +static __always_inline void +libc_feholdexcept_setround_387_53bit_ctx (struct rm_ctx *ctx, int r) +{ + libc_feholdexcept_setround_387_prec_ctx (ctx, r | _FPU_DOUBLE); +} + +static __always_inline void +libc_feholdsetround_387_prec_ctx (struct rm_ctx *ctx, int r) +{ + fpu_control_t cw, new_cw; + + _FPU_GETCW (cw); + new_cw = cw; + new_cw &= ~(_FPU_RC_ZERO | _FPU_EXTENDED); + new_cw |= r; + + ctx->env.__control_word = cw; + if (__glibc_unlikely (new_cw != cw)) + { + _FPU_SETCW (new_cw); + ctx->updated_status = true; + } + else + ctx->updated_status = false; +} + +static __always_inline void +libc_feholdsetround_387_ctx (struct rm_ctx *ctx, int r) +{ + libc_feholdsetround_387_prec_ctx (ctx, r | _FPU_EXTENDED); +} + +static __always_inline void +libc_feholdsetround_387_53bit_ctx (struct rm_ctx *ctx, int r) +{ + libc_feholdsetround_387_prec_ctx (ctx, r | _FPU_DOUBLE); +} + +static __always_inline void +libc_feholdsetround_sse_ctx (struct rm_ctx *ctx, int r) +{ + unsigned int mxcsr, new_mxcsr; + + asm (STMXCSR " %0" : "=m" (*&mxcsr)); + new_mxcsr = (mxcsr & ~0x6000) | (r << 3); + + ctx->env.__mxcsr = mxcsr; + if (__glibc_unlikely 
(new_mxcsr != mxcsr)) + { + asm volatile (LDMXCSR " %0" : : "m" (*&new_mxcsr)); + ctx->updated_status = true; + } + else + ctx->updated_status = false; +} + +static __always_inline void +libc_feresetround_sse_ctx (struct rm_ctx *ctx) +{ + if (__glibc_unlikely (ctx->updated_status)) + libc_feresetround_sse (&ctx->env); +} + +static __always_inline void +libc_feresetround_387_ctx (struct rm_ctx *ctx) +{ + if (__glibc_unlikely (ctx->updated_status)) + _FPU_SETCW (ctx->env.__control_word); +} + +static __always_inline void +libc_feupdateenv_387_ctx (struct rm_ctx *ctx) +{ + if (__glibc_unlikely (ctx->updated_status)) + libc_feupdateenv_test_387 (&ctx->env, 0); +} + +#ifdef __SSE_MATH__ +# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_sse_ctx +# define libc_fesetenvf_ctx libc_fesetenv_sse_ctx +# define libc_feupdateenvf_ctx libc_feupdateenv_sse_ctx +# define libc_feholdsetroundf_ctx libc_feholdsetround_sse_ctx +# define libc_feresetroundf_ctx libc_feresetround_sse_ctx +#else +# define libc_feholdexcept_setroundf_ctx libc_feholdexcept_setround_387_ctx +# define libc_feupdateenvf_ctx libc_feupdateenv_387_ctx +# define libc_feholdsetroundf_ctx libc_feholdsetround_387_ctx +# define libc_feresetroundf_ctx libc_feresetround_387_ctx +#endif /* __SSE_MATH__ */ + +#ifdef __SSE2_MATH__ +# define libc_feholdexcept_setround_ctx libc_feholdexcept_setround_sse_ctx +# define libc_fesetenv_ctx libc_fesetenv_sse_ctx +# define libc_feupdateenv_ctx libc_feupdateenv_sse_ctx +# define libc_feholdsetround_ctx libc_feholdsetround_sse_ctx +# define libc_feresetround_ctx libc_feresetround_sse_ctx +#else +# define libc_feholdexcept_setround_ctx libc_feholdexcept_setround_387_ctx +# define libc_feupdateenv_ctx libc_feupdateenv_387_ctx +# define libc_feholdsetround_ctx libc_feholdsetround_387_ctx +# define libc_feresetround_ctx libc_feresetround_387_ctx +#endif /* __SSE2_MATH__ */ + +#define libc_feholdexcept_setroundl_ctx libc_feholdexcept_setround_387_ctx +#define libc_feupdateenvl_ctx libc_feupdateenv_387_ctx +#define libc_feholdsetroundl_ctx libc_feholdsetround_387_ctx +#define libc_feresetroundl_ctx libc_feresetround_387_ctx + +#ifndef __SSE2_MATH__ +# define libc_feholdsetround_53bit_ctx libc_feholdsetround_387_53bit_ctx +# define libc_feresetround_53bit_ctx libc_feresetround_387_ctx +#endif + +#undef __mxcsr + +#endif /* FENV_PRIVATE_H */ diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetenv.c b/REORG.TODO/sysdeps/i386/fpu/fesetenv.c new file mode 100644 index 0000000000..a338e5d555 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/fesetenv.c @@ -0,0 +1,131 @@ +/* Install given floating-point environment. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <fenv.h> +#include <fpu_control.h> +#include <assert.h> +#include <unistd.h> +#include <ldsodefs.h> +#include <dl-procinfo.h> + + +/* All exceptions, including the x86-specific "denormal operand" + exception. */ +#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM) + + +int +__fesetenv (const fenv_t *envp) +{ + fenv_t temp; + + /* The memory block used by fstenv/fldenv has a size of 28 bytes. */ + assert (sizeof (fenv_t) == 28); + + /* Install the environment specified by ENVP. But there are a few + values which we do not want to come from the saved environment. + Therefore, we get the current environment and replace the values + we want to use from the environment specified by the parameter. */ + __asm__ ("fnstenv %0" : "=m" (*&temp)); + + if (envp == FE_DFL_ENV) + { + temp.__control_word |= FE_ALL_EXCEPT_X86; + temp.__control_word &= ~FE_TOWARDZERO; + temp.__control_word |= _FPU_EXTENDED; + temp.__status_word &= ~FE_ALL_EXCEPT_X86; + } + else if (envp == FE_NOMASK_ENV) + { + temp.__control_word &= ~(FE_ALL_EXCEPT | FE_TOWARDZERO); + /* Keep the "denormal operand" exception masked. */ + temp.__control_word |= __FE_DENORM; + temp.__control_word |= _FPU_EXTENDED; + temp.__status_word &= ~FE_ALL_EXCEPT_X86; + } + else + { + temp.__control_word &= ~(FE_ALL_EXCEPT_X86 + | FE_TOWARDZERO + | _FPU_EXTENDED); + temp.__control_word |= (envp->__control_word + & (FE_ALL_EXCEPT_X86 + | FE_TOWARDZERO + | _FPU_EXTENDED)); + temp.__status_word &= ~FE_ALL_EXCEPT_X86; + temp.__status_word |= envp->__status_word & FE_ALL_EXCEPT_X86; + } + temp.__eip = 0; + temp.__cs_selector = 0; + temp.__opcode = 0; + temp.__data_offset = 0; + temp.__data_selector = 0; + + __asm__ ("fldenv %0" : : "m" (temp)); + + if (HAS_CPU_FEATURE (SSE)) + { + unsigned int mxcsr; + __asm__ ("stmxcsr %0" : "=m" (mxcsr)); + + if (envp == FE_DFL_ENV) + { + /* Clear SSE exceptions. */ + mxcsr &= ~FE_ALL_EXCEPT_X86; + /* Set mask for SSE MXCSR. */ + mxcsr |= (FE_ALL_EXCEPT_X86 << 7); + /* Set rounding to FE_TONEAREST. */ + mxcsr &= ~0x6000; + mxcsr |= (FE_TONEAREST << 3); + /* Clear the FZ and DAZ bits. */ + mxcsr &= ~0x8040; + } + else if (envp == FE_NOMASK_ENV) + { + /* Clear SSE exceptions. */ + mxcsr &= ~FE_ALL_EXCEPT_X86; + /* Do not mask exceptions. */ + mxcsr &= ~(FE_ALL_EXCEPT << 7); + /* Keep the "denormal operand" exception masked. */ + mxcsr |= (__FE_DENORM << 7); + /* Set rounding to FE_TONEAREST. */ + mxcsr &= ~0x6000; + mxcsr |= (FE_TONEAREST << 3); + /* Clear the FZ and DAZ bits. */ + mxcsr &= ~0x8040; + } + else + mxcsr = envp->__eip; + + __asm__ ("ldmxcsr %0" : : "m" (mxcsr)); + } + + /* Success. */ + return 0; +} + +#include <shlib-compat.h> +#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2) +strong_alias (__fesetenv, __old_fesetenv) +compat_symbol (libm, __old_fesetenv, fesetenv, GLIBC_2_1); +#endif + +libm_hidden_def (__fesetenv) +libm_hidden_ver (__fesetenv, fesetenv) +versioned_symbol (libm, __fesetenv, fesetenv, GLIBC_2_2); diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetexcept.c b/REORG.TODO/sysdeps/i386/fpu/fesetexcept.c new file mode 100644 index 0000000000..adfcf17ba6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/fesetexcept.c @@ -0,0 +1,31 @@ +/* Set given exception flags. i386 version. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> + +int +fesetexcept (int excepts) +{ + fenv_t temp; + + __asm__ ("fnstenv %0" : "=m" (*&temp)); + temp.__status_word |= excepts & FE_ALL_EXCEPT; + __asm__ ("fldenv %0" : : "m" (*&temp)); + + return 0; +} diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetmode.c b/REORG.TODO/sysdeps/i386/fpu/fesetmode.c new file mode 100644 index 0000000000..bd9f74cd97 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/fesetmode.c @@ -0,0 +1,54 @@ +/* Install given floating-point control modes. i386 version. + Copyright (C) 2016-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <fenv.h> +#include <fpu_control.h> +#include <unistd.h> +#include <ldsodefs.h> +#include <dl-procinfo.h> + +/* All exceptions, including the x86-specific "denormal operand" + exception. */ +#define FE_ALL_EXCEPT_X86 (FE_ALL_EXCEPT | __FE_DENORM) + +int +fesetmode (const femode_t *modep) +{ + fpu_control_t cw; + if (modep == FE_DFL_MODE) + cw = _FPU_DEFAULT; + else + cw = modep->__control_word; + _FPU_SETCW (cw); + if (HAS_CPU_FEATURE (SSE)) + { + unsigned int mxcsr; + __asm__ ("stmxcsr %0" : "=m" (mxcsr)); + /* Preserve SSE exception flags but restore other state in + MXCSR. */ + mxcsr &= FE_ALL_EXCEPT_X86; + if (modep == FE_DFL_MODE) + /* Default MXCSR state has all bits zero except for those + masking exceptions. */ + mxcsr |= FE_ALL_EXCEPT_X86 << 7; + else + mxcsr |= modep->__mxcsr & ~FE_ALL_EXCEPT_X86; + __asm__ ("ldmxcsr %0" : : "m" (mxcsr)); + } + return 0; +} diff --git a/REORG.TODO/sysdeps/i386/fpu/fesetround.c b/REORG.TODO/sysdeps/i386/fpu/fesetround.c new file mode 100644 index 0000000000..a3fa6235c0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/fesetround.c @@ -0,0 +1,54 @@ +/* Set current rounding direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__fesetround (int round)
+{
+  unsigned short int cw;
+
+  if ((round & ~0xc00) != 0)
+    /* ROUND is not a valid rounding mode.  */
+    return 1;
+
+  __asm__ ("fnstcw %0" : "=m" (*&cw));
+  cw &= ~0xc00;
+  cw |= round;
+  __asm__ ("fldcw %0" : : "m" (*&cw));
+
+  /* If the CPU supports SSE, we set the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xcw;
+
+      __asm__ ("stmxcsr %0" : "=m" (*&xcw));
+      xcw &= ~0x6000;
+      xcw |= round << 3;
+      __asm__ ("ldmxcsr %0" : : "m" (*&xcw));
+    }
+
+  return 0;
+}
+libm_hidden_def (__fesetround)
+weak_alias (__fesetround, fesetround)
+libm_hidden_weak (fesetround)
diff --git a/REORG.TODO/sysdeps/i386/fpu/feupdateenv.c b/REORG.TODO/sysdeps/i386/fpu/feupdateenv.c
new file mode 100644
index 0000000000..b610289cd0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/feupdateenv.c
@@ -0,0 +1,60 @@
+/* Install given floating-point environment and raise exceptions.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <dl-procinfo.h>
+#include <ldsodefs.h>
+
+int
+__feupdateenv (const fenv_t *envp)
+{
+  fexcept_t temp;
+  unsigned int xtemp = 0;
+
+  /* Save current exceptions.  */
+  __asm__ ("fnstsw %0" : "=m" (*&temp));
+
+  /* If the CPU supports SSE, we test the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    __asm__ ("stmxcsr %0" : "=m" (*&xtemp));
+
+  temp = (temp | xtemp) & FE_ALL_EXCEPT;
+
+  /* Install new environment.  */
+  __fesetenv (envp);
+
+  /* Raise the saved exceptions.  Incidentally, for us the
+     implementation-defined format of the values in objects of type
+     fexcept_t is the same as the one specified using the FE_*
+     constants.  */
+  __feraiseexcept ((int) temp);
+
+  /* Success.
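     (A typical caller pairs this function with feholdexcept: hold
     exceptions around a computation, then let feupdateenv reinstall
     the previous environment and re-raise whatever the computation
     set.  A minimal, hypothetical sketch of such a caller, with
     some_computation standing in for arbitrary floating-point code:

       fenv_t env;
       double r;
       feholdexcept (&env);
       r = some_computation (x);
       feupdateenv (&env);

     Any flags some_computation raised while held are merged back in
     here through __feraiseexcept.)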
*/
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feupdateenv, __old_feupdateenv)
+compat_symbol (libm, __old_feupdateenv, feupdateenv, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__feupdateenv)
+libm_hidden_ver (__feupdateenv, feupdateenv)
+versioned_symbol (libm, __feupdateenv, feupdateenv, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c b/REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c
new file mode 100644
index 0000000000..954e5f69d8
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fgetexcptflg.c
@@ -0,0 +1,57 @@
+/* Store current representation for exceptions.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+
+int
+__fegetexceptflag (fexcept_t *flagp, int excepts)
+{
+  fexcept_t temp;
+
+  /* Get the current exceptions.  */
+  __asm__ ("fnstsw %0" : "=m" (*&temp));
+
+  *flagp = temp & excepts & FE_ALL_EXCEPT;
+
+  /* If the CPU supports SSE, we read the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int sse_exc;
+
+      /* Get the current MXCSR.  */
+      __asm__ ("stmxcsr %0" : "=m" (*&sse_exc));
+
+      *flagp |= sse_exc & excepts & FE_ALL_EXCEPT;
+    }
+
+  /* Success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fegetexceptflag, __old_fegetexceptflag)
+compat_symbol (libm, __old_fegetexceptflag, fegetexceptflag, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fegetexceptflag, fegetexceptflag, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c b/REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c
new file mode 100644
index 0000000000..913d7b912c
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fraiseexcpt.c
@@ -0,0 +1,124 @@
+/* Raise given exceptions.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.
*/
+
+#include <fenv.h>
+#include <math.h>
+
+int
+__feraiseexcept (int excepts)
+{
+  /* Raise exceptions represented by EXCEPTS.  But we must raise only
+     one signal at a time.  It is important that if the
+     overflow/underflow exception and the inexact exception are given
+     at the same time, the overflow/underflow exception follows the
+     inexact exception.  */
+
+  /* First: invalid exception.  */
+  if ((FE_INVALID & excepts) != 0)
+    {
+      /* One example of an invalid operation is 0.0 / 0.0.  */
+      double d;
+      __asm__ __volatile__ ("fldz; fdiv %%st, %%st(0); fwait" : "=t" (d));
+      (void) &d;
+    }
+
+  /* Next: division by zero.  */
+  if ((FE_DIVBYZERO & excepts) != 0)
+    {
+      double d;
+      __asm__ __volatile__ ("fldz; fld1; fdivp %%st, %%st(1); fwait"
+			    : "=t" (d));
+      (void) &d;
+    }
+
+  /* Next: overflow.  */
+  if ((FE_OVERFLOW & excepts) != 0)
+    {
+      /* There is no way to raise only the overflow flag.  Do it the
+	 hard way.  */
+      fenv_t temp;
+
+      /* Bah, we have to set selected exception bits.  Since there is
+	 no `fldsw' instruction we have to do it the hard way.  */
+      __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp));
+
+      /* Set the relevant bits.  */
+      temp.__status_word |= FE_OVERFLOW;
+
+      /* Put the new data in effect.  */
+      __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp));
+
+      /* And raise the exception.  */
+      __asm__ __volatile__ ("fwait");
+    }
+
+  /* Next: underflow.  */
+  if ((FE_UNDERFLOW & excepts) != 0)
+    {
+      /* There is no way to raise only the underflow flag.  Do it the
+	 hard way.  */
+      fenv_t temp;
+
+      /* Bah, we have to set selected exception bits.  Since there is
+	 no `fldsw' instruction we have to do it the hard way.  */
+      __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp));
+
+      /* Set the relevant bits.  */
+      temp.__status_word |= FE_UNDERFLOW;
+
+      /* Put the new data in effect.  */
+      __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp));
+
+      /* And raise the exception.  */
+      __asm__ __volatile__ ("fwait");
+    }
+
+  /* Last: inexact.  */
+  if ((FE_INEXACT & excepts) != 0)
+    {
+      /* There is no way to raise only the inexact flag.  Do it the
+	 hard way.  */
+      fenv_t temp;
+
+      /* Bah, we have to set selected exception bits.  Since there is
+	 no `fldsw' instruction we have to do it the hard way.  */
+      __asm__ __volatile__ ("fnstenv %0" : "=m" (*&temp));
+
+      /* Set the relevant bits.  */
+      temp.__status_word |= FE_INEXACT;
+
+      /* Put the new data in effect.  */
+      __asm__ __volatile__ ("fldenv %0" : : "m" (*&temp));
+
+      /* And raise the exception.  */
+      __asm__ __volatile__ ("fwait");
+    }
+
+  /* Success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feraiseexcept, __old_feraiseexcept)
+compat_symbol (libm, __old_feraiseexcept, feraiseexcept, GLIBC_2_1);
+#endif
+
+libm_hidden_def (__feraiseexcept)
+libm_hidden_ver (__feraiseexcept, feraiseexcept)
+versioned_symbol (libm, __feraiseexcept, feraiseexcept, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c b/REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c
new file mode 100644
index 0000000000..efa64aaefd
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/fsetexcptflg.c
@@ -0,0 +1,69 @@
+/* Set floating-point environment exception handling.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <math.h>
+#include <unistd.h>
+#include <ldsodefs.h>
+#include <dl-procinfo.h>
+
+int
+__fesetexceptflag (const fexcept_t *flagp, int excepts)
+{
+  fenv_t temp;
+
+  /* Get the current environment.  We have to do this since we cannot
+     separately set the status word.  */
+  __asm__ ("fnstenv %0" : "=m" (*&temp));
+
+  temp.__status_word &= ~(excepts & FE_ALL_EXCEPT);
+  temp.__status_word |= *flagp & excepts & FE_ALL_EXCEPT;
+
+  /* Store the new status word (along with the rest of the
+     environment).  Newly set exceptions are not acted upon until the
+     next floating-point instruction is executed.  */
+  __asm__ ("fldenv %0" : : "m" (*&temp));
+
+  /* If the CPU supports SSE, we set the MXCSR as well.  */
+  if (HAS_CPU_FEATURE (SSE))
+    {
+      unsigned int xnew_exc;
+
+      /* Get the current MXCSR.  */
+      __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc));
+
+      /* Set the relevant bits.  */
+      xnew_exc &= ~(excepts & FE_ALL_EXCEPT);
+      xnew_exc |= *flagp & excepts & FE_ALL_EXCEPT;
+
+      /* Put the new data in effect.  */
+      __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc));
+    }
+
+  /* Success.  */
+  return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fesetexceptflag, __old_fesetexceptflag)
+compat_symbol (libm, __old_fesetexceptflag, fesetexceptflag, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fesetexceptflag, fesetexceptflag, GLIBC_2_2);
diff --git a/REORG.TODO/sysdeps/i386/fpu/ftestexcept.c b/REORG.TODO/sysdeps/i386/fpu/ftestexcept.c
new file mode 100644
index 0000000000..f523f9e709
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/ftestexcept.c
@@ -0,0 +1,40 @@
+/* Test exception in current environment.
+   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <fenv.h>
+#include <unistd.h>
+#include <dl-procinfo.h>
+#include <ldsodefs.h>
+
+int
+fetestexcept (int excepts)
+{
+  short temp;
+  int xtemp = 0;
+
+  /* Get current exceptions.  */
+  __asm__ ("fnstsw %0" : "=a" (temp));
+
+  /* If the CPU supports SSE, we test the MXCSR as well.
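     The x87 status word and the MXCSR use the same bit positions for
     the IEEE exception flags (for example FE_INVALID is bit 0 and
     FE_INEXACT is bit 5 in both registers), which is why the two
     values can simply be OR-ed together before being masked with
     EXCEPTS below.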
*/ + if (HAS_CPU_FEATURE (SSE)) + __asm__ ("stmxcsr %0" : "=m" (*&xtemp)); + + return (temp | xtemp) & excepts & FE_ALL_EXCEPT; +} +libm_hidden_def (fetestexcept) diff --git a/REORG.TODO/sysdeps/i386/fpu/halfulp.c b/REORG.TODO/sysdeps/i386/fpu/halfulp.c new file mode 100644 index 0000000000..1cc8931700 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/halfulp.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h b/REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h new file mode 100644 index 0000000000..6ffc8e6f64 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/i386-math-asm.h @@ -0,0 +1,340 @@ +/* Helper macros for x86 libm functions. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _I386_MATH_ASM_H +#define _I386_MATH_ASM_H 1 + +/* Remove excess range and precision by storing a value on the stack + and loading it back. */ +#define FLT_NARROW_EVAL \ + subl $4, %esp; \ + cfi_adjust_cfa_offset (4); \ + fstps (%esp); \ + flds (%esp); \ + addl $4, %esp; \ + cfi_adjust_cfa_offset (-4); +#define DBL_NARROW_EVAL \ + subl $8, %esp; \ + cfi_adjust_cfa_offset (8); \ + fstpl (%esp); \ + fldl (%esp); \ + addl $8, %esp; \ + cfi_adjust_cfa_offset (-8); + +/* Define constants for the minimum value of a floating-point + type. */ +#define DEFINE_FLT_MIN \ + .section .rodata.cst4,"aM",@progbits,4; \ + .p2align 2; \ + .type flt_min,@object; \ +flt_min: \ + .byte 0, 0, 0x80, 0; \ + .size flt_min, .-flt_min; +#define DEFINE_DBL_MIN \ + .section .rodata.cst8,"aM",@progbits,8; \ + .p2align 3; \ + .type dbl_min,@object; \ +dbl_min: \ + .byte 0, 0, 0, 0, 0, 0, 0x10, 0; \ + .size dbl_min, .-dbl_min; +#define DEFINE_LDBL_MIN \ + .section .rodata.cst16,"aM",@progbits,16; \ + .p2align 4; \ + .type ldbl_min,@object; \ +ldbl_min: \ + .byte 0, 0, 0, 0, 0, 0, 0, 0x80, 0x1, 0; \ + .byte 0, 0, 0, 0, 0, 0; \ + .size ldbl_min, .-ldbl_min; + +/* Remove excess range and precision by storing a value on the stack + and loading it back. The value is given to be nonnegative or NaN; + if it is subnormal, also force an underflow exception. The + relevant constant for the minimum of the type must have been + defined, the MO macro must have been defined for access to memory + operands, and, if PIC, the PIC register must have been loaded. 
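   The underflow itself is forced by squaring the operand: once the
   value is known to be subnormal, its square is far too small for the
   destination format, so the narrowing store of the square raises
   underflow (and inexact) before the throwaway result is popped.  In
   rough C terms, and only as a hypothetical sketch of what the float
   variants below do (fabsf from <math.h>, FLT_MIN from <float.h>):

     if (fabsf (x) < FLT_MIN)
       {
         volatile float force = x * x;
         (void) force;
       }

   where the volatile store is what keeps the compiler from deleting
   the otherwise dead multiplication.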
*/ +#define FLT_NARROW_EVAL_UFLOW_NONNEG_NAN \ + subl $4, %esp; \ + cfi_adjust_cfa_offset (4); \ + flds MO(flt_min); \ + fld %st(1); \ + fucompp; \ + fnstsw; \ + sahf; \ + jnc 6424f; \ + fld %st(0); \ + fmul %st(0); \ + fstps (%esp); \ +6424: fstps (%esp); \ + flds (%esp); \ + addl $4, %esp; \ + cfi_adjust_cfa_offset (-4); +#define DBL_NARROW_EVAL_UFLOW_NONNEG_NAN \ + subl $8, %esp; \ + cfi_adjust_cfa_offset (8); \ + fldl MO(dbl_min); \ + fld %st(1); \ + fucompp; \ + fnstsw; \ + sahf; \ + jnc 6453f; \ + fld %st(0); \ + fmul %st(0); \ + fstpl (%esp); \ +6453: fstpl (%esp); \ + fldl (%esp); \ + addl $8, %esp; \ + cfi_adjust_cfa_offset (-8); + +/* Likewise, but the argument is not a NaN (so fcom instructions, + which support memory operands, can be used). */ +#define FLT_NARROW_EVAL_UFLOW_NONNEG \ + subl $4, %esp; \ + cfi_adjust_cfa_offset (4); \ + fcoms MO(flt_min); \ + fnstsw; \ + sahf; \ + jnc 6424f; \ + fld %st(0); \ + fmul %st(0); \ + fstps (%esp); \ +6424: fstps (%esp); \ + flds (%esp); \ + addl $4, %esp; \ + cfi_adjust_cfa_offset (-4); +#define DBL_NARROW_EVAL_UFLOW_NONNEG \ + subl $8, %esp; \ + cfi_adjust_cfa_offset (8); \ + fcoml MO(dbl_min); \ + fnstsw; \ + sahf; \ + jnc 6453f; \ + fld %st(0); \ + fmul %st(0); \ + fstpl (%esp); \ +6453: fstpl (%esp); \ + fldl (%esp); \ + addl $8, %esp; \ + cfi_adjust_cfa_offset (-8); + +/* Likewise, but the non-NaN argument may be negative. */ +#define FLT_NARROW_EVAL_UFLOW_NONNAN \ + subl $4, %esp; \ + cfi_adjust_cfa_offset (4); \ + fld %st(0); \ + fabs; \ + fcomps MO(flt_min); \ + fnstsw; \ + sahf; \ + jnc 6424f; \ + fld %st(0); \ + fmul %st(0); \ + fstps (%esp); \ +6424: fstps (%esp); \ + flds (%esp); \ + addl $4, %esp; \ + cfi_adjust_cfa_offset (-4); +#define DBL_NARROW_EVAL_UFLOW_NONNAN \ + subl $8, %esp; \ + cfi_adjust_cfa_offset (8); \ + fld %st(0); \ + fabs; \ + fcompl MO(dbl_min); \ + fnstsw; \ + sahf; \ + jnc 6453f; \ + fld %st(0); \ + fmul %st(0); \ + fstpl (%esp); \ +6453: fstpl (%esp); \ + fldl (%esp); \ + addl $8, %esp; \ + cfi_adjust_cfa_offset (-8); + +/* Force an underflow exception if the given value is subnormal. The + relevant constant for the minimum of the type must have been + defined, the MO macro must have been defined for access to memory + operands, and, if PIC, the PIC register must have been loaded. */ +#define FLT_CHECK_FORCE_UFLOW \ + flds MO(flt_min); \ + fld %st(1); \ + fabs; \ + fucompp; \ + fnstsw; \ + sahf; \ + jnc 6424f; \ + subl $4, %esp; \ + cfi_adjust_cfa_offset (4); \ + fld %st(0); \ + fmul %st(0); \ + fstps (%esp); \ + addl $4, %esp; \ + cfi_adjust_cfa_offset (-4); \ +6424: +#define DBL_CHECK_FORCE_UFLOW \ + fldl MO(dbl_min); \ + fld %st(1); \ + fabs; \ + fucompp; \ + fnstsw; \ + sahf; \ + jnc 6453f; \ + subl $8, %esp; \ + cfi_adjust_cfa_offset (8); \ + fld %st(0); \ + fmul %st(0); \ + fstpl (%esp); \ + addl $8, %esp; \ + cfi_adjust_cfa_offset (-8); \ +6453: + +/* Likewise, but also remove excess range and precision if the value + is subnormal. 
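   (Note the two consecutive fstps in the sequences below: the first
   pops the squared value that exists only to signal the underflow,
   and the second stores the original operand so that the final load
   brings it back with the narrower range and precision.)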
*/ +#define FLT_CHECK_FORCE_UFLOW_NARROW \ + flds MO(flt_min); \ + fld %st(1); \ + fabs; \ + fucompp; \ + fnstsw; \ + sahf; \ + jnc 6424f; \ + subl $4, %esp; \ + cfi_adjust_cfa_offset (4); \ + fld %st(0); \ + fmul %st(0); \ + fstps (%esp); \ + fstps (%esp); \ + flds (%esp); \ + addl $4, %esp; \ + cfi_adjust_cfa_offset (-4); \ +6424: +#define DBL_CHECK_FORCE_UFLOW_NARROW \ + fldl MO(dbl_min); \ + fld %st(1); \ + fabs; \ + fucompp; \ + fnstsw; \ + sahf; \ + jnc 6453f; \ + subl $8, %esp; \ + cfi_adjust_cfa_offset (8); \ + fld %st(0); \ + fmul %st(0); \ + fstpl (%esp); \ + fstpl (%esp); \ + fldl (%esp); \ + addl $8, %esp; \ + cfi_adjust_cfa_offset (-8); \ +6453: + +/* Likewise, but the argument is nonnegative or NaN. */ +#define LDBL_CHECK_FORCE_UFLOW_NONNEG_NAN \ + fldt MO(ldbl_min); \ + fld %st(1); \ + fucompp; \ + fnstsw; \ + sahf; \ + jnc 6464f; \ + fld %st(0); \ + fmul %st(0); \ + fstp %st(0); \ +6464: + +/* Likewise, but the argument is not a NaN. */ +#define FLT_CHECK_FORCE_UFLOW_NONNAN \ + fld %st(0); \ + fabs; \ + fcomps MO(flt_min); \ + fnstsw; \ + sahf; \ + jnc 6424f; \ + subl $4, %esp; \ + cfi_adjust_cfa_offset (4); \ + fld %st(0); \ + fmul %st(0); \ + fstps (%esp); \ + addl $4, %esp; \ + cfi_adjust_cfa_offset (-4); \ +6424: +#define DBL_CHECK_FORCE_UFLOW_NONNAN \ + fld %st(0); \ + fabs; \ + fcompl MO(dbl_min); \ + fnstsw; \ + sahf; \ + jnc 6453f; \ + subl $8, %esp; \ + cfi_adjust_cfa_offset (8); \ + fld %st(0); \ + fmul %st(0); \ + fstpl (%esp); \ + addl $8, %esp; \ + cfi_adjust_cfa_offset (-8); \ +6453: +#define LDBL_CHECK_FORCE_UFLOW_NONNAN \ + fldt MO(ldbl_min); \ + fld %st(1); \ + fabs; \ + fcompp; \ + fnstsw; \ + sahf; \ + jnc 6464f; \ + fld %st(0); \ + fmul %st(0); \ + fstp %st(0); \ +6464: + +/* Likewise, but the argument is nonnegative and not a NaN. */ +#define FLT_CHECK_FORCE_UFLOW_NONNEG \ + fcoms MO(flt_min); \ + fnstsw; \ + sahf; \ + jnc 6424f; \ + subl $4, %esp; \ + cfi_adjust_cfa_offset (4); \ + fld %st(0); \ + fmul %st(0); \ + fstps (%esp); \ + addl $4, %esp; \ + cfi_adjust_cfa_offset (-4); \ +6424: +#define DBL_CHECK_FORCE_UFLOW_NONNEG \ + fcoml MO(dbl_min); \ + fnstsw; \ + sahf; \ + jnc 6453f; \ + subl $8, %esp; \ + cfi_adjust_cfa_offset (8); \ + fld %st(0); \ + fmul %st(0); \ + fstpl (%esp); \ + addl $8, %esp; \ + cfi_adjust_cfa_offset (-8); \ +6453: +#define LDBL_CHECK_FORCE_UFLOW_NONNEG \ + fldt MO(ldbl_min); \ + fld %st(1); \ + fcompp; \ + fnstsw; \ + sahf; \ + jnc 6464f; \ + fld %st(0); \ + fmul %st(0); \ + fstp %st(0); \ +6464: + +#endif /* i386-math-asm.h. 
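   Usage note: a float or double routine built on the x87 would
   typically invoke one of these macros right before returning its
   result, for instance (hypothetical fragment, not part of this
   patch):

     ENTRY (__hypothetical_logf)
       fldln2
       flds 4(%esp)
       fyl2x
       FLT_NARROW_EVAL
       ret
     END (__hypothetical_logf)

   so that the value left in st(0) carries float range and precision
   rather than the x87's full extended precision.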
*/ diff --git a/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps new file mode 100644 index 0000000000..0fc50907ad --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps @@ -0,0 +1,2202 @@ +# Begin of automatic generation + +# Maximal error of functions: +Function: "acos": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "acos_downward": +ildouble: 2 +ldouble: 2 + +Function: "acos_towardzero": +ildouble: 2 +ldouble: 2 + +Function: "acos_upward": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "acosh": +double: 1 +idouble: 1 +ildouble: 4 +ldouble: 2 + +Function: "acosh_downward": +double: 1 +idouble: 1 +ildouble: 6 +ldouble: 4 + +Function: "acosh_towardzero": +double: 1 +idouble: 1 +ildouble: 6 +ldouble: 4 + +Function: "acosh_upward": +double: 1 +idouble: 1 +ildouble: 4 +ldouble: 3 + +Function: "asin": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "asin_downward": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "asin_towardzero": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "asin_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "asinh": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "asinh_downward": +double: 1 +float: 1 +idouble: 1 +ildouble: 5 +ldouble: 5 + +Function: "asinh_towardzero": +double: 1 +float: 1 +idouble: 1 +ildouble: 4 +ldouble: 4 + +Function: "asinh_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 5 +ldouble: 5 + +Function: "atan": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan2": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan2_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan2_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan2_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atanh": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "atanh_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 4 + +Function: "atanh_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 5 +ldouble: 3 + +Function: "atanh_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 5 +ldouble: 5 + +Function: "cabs": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "cabs_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "cabs_towardzero": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "cabs_upward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cacos": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "cacos": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cacos_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacos_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 6 +ldouble: 6 + +Function: Real part of "cacos_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacos_towardzero": 
+double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Real part of "cacos_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacos_upward": +double: 7 +float: 7 +idouble: 7 +ifloat: 7 +ildouble: 7 +ldouble: 7 + +Function: Real part of "cacosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cacosh_downward": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "cacosh_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "cacosh_towardzero": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "cacosh_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cacosh_upward": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "cacosh_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "carg": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "carg_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "carg_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "carg_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "casin": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "casin": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Real part of "casin_downward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "casin_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 6 +ldouble: 6 + +Function: Real part of "casin_towardzero": +double: 3 +float: 1 +idouble: 3 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "casin_towardzero": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Real part of "casin_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "casin_upward": +double: 7 +float: 7 +idouble: 7 +ifloat: 7 +ildouble: 7 +ldouble: 7 + +Function: Real part of "casinh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "casinh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "casinh_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 6 +ldouble: 6 + +Function: Imaginary part of "casinh_downward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "casinh_towardzero": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "casinh_towardzero": +double: 3 +float: 1 +idouble: 3 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "casinh_upward": +double: 7 +float: 7 +idouble: 7 +ifloat: 7 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "casinh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "catan": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan": +double: 1 +float: 1 +idouble: 1 
+ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catan_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan_downward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "catan_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "catan_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "catanh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catanh": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catanh_downward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "catanh_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catanh_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "catanh_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catanh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "catanh_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "cbrt": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "cbrt_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "cbrt_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "cbrt_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ccos": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "ccos": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ccos_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccos_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccos_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccos_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccos_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "ccos_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "ccosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "ccosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ccosh_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccosh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccosh_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccosh_towardzero": +double: 3 +float: 3 +idouble: 3 
+ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccosh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "ccosh_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cexp": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "cexp": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cexp_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "cexp_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "cexp_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "cexp_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "cexp_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cexp_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "clog": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "clog10": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "clog10": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: Real part of "clog10_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 8 +ldouble: 8 + +Function: Imaginary part of "clog10_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog10_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 8 +ldouble: 8 + +Function: Imaginary part of "clog10_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog10_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "clog10_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "clog_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "clog_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "clog_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "clog_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "clog_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "cos": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "cos_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "cos_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "cos_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "cosh": +double: 1 +float: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "cosh_downward": +double: 2 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 3 + +Function: "cosh_towardzero": +double: 2 +float: 1 +idouble: 1 +ifloat: 1 
+ildouble: 2 +ldouble: 2 + +Function: "cosh_upward": +double: 4 +float: 2 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 3 + +Function: Real part of "cpow": +double: 2 +float: 5 +idouble: 2 +ifloat: 5 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "cpow": +float: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "cpow_downward": +double: 5 +float: 8 +idouble: 5 +ifloat: 8 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "cpow_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cpow_towardzero": +double: 5 +float: 8 +idouble: 5 +ifloat: 8 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "cpow_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cpow_upward": +double: 4 +float: 1 +idouble: 4 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cpow_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "csin": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "csin": +float: 1 +ifloat: 1 + +Function: Real part of "csin_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csin_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csin_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csin_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csin_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "csin_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "csinh": +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "csinh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "csinh_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csinh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csinh_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csinh_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csinh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "csinh_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "csqrt": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "csqrt": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Real part of "csqrt_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "csqrt_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "csqrt_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "csqrt_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "csqrt_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "csqrt_upward": +double: 1 +float: 2 
+idouble: 1 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctan": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "ctan": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ctan_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "ctan_downward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctan_towardzero": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "ctan_towardzero": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctan_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ctan_upward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ctanh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "ctanh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Real part of "ctanh_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "ctanh_downward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctanh_towardzero": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "ctanh_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ctanh_upward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ctanh_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "erf": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erf_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erf_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erf_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erfc": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "erfc_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "erfc_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "erfc_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "exp": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp10": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp10_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp10_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp10_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp2": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp2_downward": +ildouble: 1 +ldouble: 1 + +Function: "exp2_towardzero": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp2_upward": +ildouble: 1 +ldouble: 1 + +Function: "exp_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp_towardzero": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp_upward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + 
+Function: "expm1": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "expm1_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "expm1_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "expm1_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "gamma": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "gamma_downward": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 7 +ldouble: 7 + +Function: "gamma_towardzero": +double: 4 +float: 2 +idouble: 4 +ifloat: 2 +ildouble: 7 +ldouble: 7 + +Function: "gamma_upward": +double: 3 +float: 4 +idouble: 3 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: "hypot": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "hypot_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "hypot_towardzero": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "hypot_upward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "j0": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "j0_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "j0_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 5 +ldouble: 5 + +Function: "j0_upward": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "j1": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "j1_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "j1_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "j1_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: "jn": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "jn_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "jn_towardzero": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "jn_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "lgamma": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "lgamma_downward": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 7 +ldouble: 7 + +Function: "lgamma_towardzero": +double: 4 +float: 2 +idouble: 4 +ifloat: 2 +ildouble: 7 +ldouble: 7 + +Function: "lgamma_upward": +double: 3 +float: 4 +idouble: 3 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: "log": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "log10": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "log10_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "log10_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "log10_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "log1p": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "log1p_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "log1p_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "log1p_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "log2": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "log2_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + 
+Function: "log2_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "log2_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "log_downward": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "log_towardzero": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "log_upward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "pow": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "pow10": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "pow10_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "pow10_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "pow10_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "pow_downward": +double: 1 +idouble: 1 +ildouble: 4 +ldouble: 4 + +Function: "pow_towardzero": +double: 1 +idouble: 1 +ildouble: 4 +ldouble: 4 + +Function: "pow_upward": +double: 1 +idouble: 1 +ildouble: 4 +ldouble: 4 + +Function: "sin": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "sin_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "sin_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "sin_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "sincos": +float: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "sincos_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "sincos_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "sincos_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "sinh": +double: 1 +ildouble: 2 +ldouble: 2 + +Function: "sinh_downward": +double: 2 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 5 + +Function: "sinh_towardzero": +double: 2 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 4 + +Function: "sinh_upward": +double: 4 +float: 2 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 5 + +Function: "tan": +float: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "tan_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "tan_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "tan_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "tanh": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "tanh_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 7 +ldouble: 4 + +Function: "tanh_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "tanh_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 5 +ldouble: 4 + +Function: "tgamma": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "tgamma_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "tgamma_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "tgamma_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "y0": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "y0_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: "y0_towardzero": +double: 2 +float: 2 +idouble: 2 
+ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: "y0_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "y1": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "y1_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 7 +ldouble: 7 + +Function: "y1_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: "y1_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 7 +ldouble: 7 + +Function: "yn": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "yn_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: "yn_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "yn_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +# end of automatic generation diff --git a/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name new file mode 100644 index 0000000000..54ca0d8295 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/libm-test-ulps-name @@ -0,0 +1 @@ +ix86 diff --git a/REORG.TODO/sysdeps/i386/fpu/math-tests.h b/REORG.TODO/sysdeps/i386/fpu/math-tests.h new file mode 100644 index 0000000000..26d0633dc0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/math-tests.h @@ -0,0 +1,27 @@ +/* Configuration for math tests. 32-bit x86 version. + Copyright (C) 2013-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* On 32-bit x86, versions of GCC up to at least 4.8 are happy to use FPU load + instructions for sNaN values, and loading a float or double sNaN value will + already raise an INVALID exception as well as turn the sNaN into a qNaN, + rendering certain tests infeasible in this scenario. + <http://gcc.gnu.org/PR56831>. */ +#define SNAN_TESTS_float 0 +#define SNAN_TESTS_double 0 + +#include_next <math-tests.h> diff --git a/REORG.TODO/sysdeps/i386/fpu/math_private.h b/REORG.TODO/sysdeps/i386/fpu/math_private.h new file mode 100644 index 0000000000..485214391f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/math_private.h @@ -0,0 +1,7 @@ +#ifndef I386_MATH_PRIVATE_H +#define I386_MATH_PRIVATE_H 1 + +#include "fenv_private.h" +#include_next <math_private.h> + +#endif diff --git a/REORG.TODO/sysdeps/i386/fpu/mpatan.c b/REORG.TODO/sysdeps/i386/fpu/mpatan.c new file mode 100644 index 0000000000..1cc8931700 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/mpatan.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/REORG.TODO/sysdeps/i386/fpu/mpatan2.c b/REORG.TODO/sysdeps/i386/fpu/mpatan2.c new file mode 100644 index 0000000000..1cc8931700 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/mpatan2.c @@ -0,0 +1 @@ +/* Not needed. 
*/ diff --git a/REORG.TODO/sysdeps/i386/fpu/mpexp.c b/REORG.TODO/sysdeps/i386/fpu/mpexp.c new file mode 100644 index 0000000000..1cc8931700 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/mpexp.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/REORG.TODO/sysdeps/i386/fpu/mplog.c b/REORG.TODO/sysdeps/i386/fpu/mplog.c new file mode 100644 index 0000000000..1cc8931700 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/mplog.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/REORG.TODO/sysdeps/i386/fpu/mpsqrt.c b/REORG.TODO/sysdeps/i386/fpu/mpsqrt.c new file mode 100644 index 0000000000..1cc8931700 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/mpsqrt.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/REORG.TODO/sysdeps/i386/fpu/s_asinh.S b/REORG.TODO/sysdeps/i386/fpu/s_asinh.S new file mode 100644 index 0000000000..1a60f7de2c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_asinh.S @@ -0,0 +1,139 @@ +/* ix87 specific implementation of arcsinh. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
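The three asinh implementations that follow share one case analysis. A minimal C sketch of it (a paraphrase for clarity, not glibc's code; the 2^-28/2^28 cutoffs are the ones the double version tests in the high word, and the float/long-double versions differ only in cutoffs):

    #include <math.h>

    double
    asinh_sketch (double x)
    {
      double ax = fabs (x);
      double y;
      if (ax < 0x1p-28)
        y = ax;                 /* asinh(x) ~= x for tiny x */
      else if (ax > 0x1p28)
        y = log (ax) + M_LN2;   /* asinh(x) ~= log(2*|x|) for huge x */
      else if (ax > 2.0)
        y = log (2.0 * ax + 1.0 / (ax + sqrt (ax * ax + 1.0)));
      else
        y = log1p (ax + ax * ax / (1.0 + sqrt (1.0 + ax * ax)));
      return copysign (y, x);
    }

Each branch corresponds to one labelled block in the assembly; the jecxz/fchs pairs there reapply the sign that copysign supplies here, and the huge+x additions exist only to raise the inexact flag for tiny nonzero inputs.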
*/ + +#include <machine/asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type huge,@object +huge: .double 1e+300 + ASM_SIZE_DIRECTIVE(huge) + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + +#ifdef PIC +#define MO(op) op##@GOTOFF(%edx) +#else +#define MO(op) op +#endif + + .text +ENTRY(__asinh) + movl 8(%esp), %ecx + movl $0x7fffffff, %eax + andl %ecx, %eax + andl $0x80000000, %ecx + movl %eax, %edx + orl $0x800fffff, %edx + incl %edx + jz 7f // x in ±Inf or NaN + xorl %ecx, 8(%esp) + fldl 4(%esp) // |x| + cmpl $0x3e300000, %eax + jb 2f // |x| < 2^-28 + fldln2 // log(2) : |x| + cmpl $0x41b00000, %eax + fxch // |x| : log(2) + ja 3f // |x| > 2^28 +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + cmpl $0x40000000, %eax + ja 5f // |x| > 2 + + // 2^-28 <= |x| <= 2 => y = sign(x)*log1p(|x|+|x|^2/(1+sqrt(1+|x|^2))) + fld %st // |x| : |x| : log(2) + fmul %st(1) // |x|^2 : |x| : log(2) + fld %st // |x|^2 : |x|^2 : |x| : log(2) + faddl MO(one) // 1+|x|^2 : |x|^2 : |x| : log(2) + fsqrt // sqrt(1+|x|^2) : |x|^2 : |x| : log(2) + faddl MO(one) // 1+sqrt(1+|x|^2) : |x|^2 : |x| : log(2) + fdivrp // |x|^2/(1+sqrt(1+|x|^2)) : |x| : log(2) + faddp // |x|+|x|^2/(1+sqrt(1+|x|^2)) : log(2) + fcoml MO(limit) + fnstsw + sahf + ja 6f + fyl2xp1 + jecxz 4f + fchs +4: ret + +7: fldl 4(%esp) + ret + +6: faddl MO(one) + fyl2x + jecxz 4f + fchs +4: ret + + // |x| < 2^-28 => y = x (inexact iff |x| != 0.0) + .align ALIGNARG(4) +2: +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + jecxz 4f + fchs // x +4: fld %st // x : x + faddl MO(huge) // huge+x : x + fstp %st(0) // x + cmpl $0x00100000, %eax + jae 8f + subl $8, %esp + cfi_adjust_cfa_offset (8) + fld %st(0) + fmul %st(0) + fstpl (%esp) + addl $8, %esp + cfi_adjust_cfa_offset (-8) +8: ret + + // |x| > 2^28 => y = sign(x) * (log(|x|) + log(2)) + .align ALIGNARG(4) +3: fyl2x // log(|x|) + fldln2 // log(2) : log(|x|) + faddp // log(|x|)+log(2) + jecxz 4f + fchs +4: ret + + // |x| > 2 => y = sign(x) * log(2*|x| + 1/(|x|+sqrt(x*x+1))) + .align ALIGNARG(4) +5: fld %st // |x| : |x| : log(2) + fadd %st, %st(1) // |x| : 2*|x| : log(2) + fld %st // |x| : |x| : 2*|x| : log(2) + fmul %st(1) // |x|^2 : |x| : 2*|x| : log(2) + faddl MO(one) // 1+|x|^2 : |x| : 2*|x| : log(2) + fsqrt // sqrt(1+|x|^2) : |x| : 2*|x| : log(2) + faddp // |x|+sqrt(1+|x|^2) : 2*|x| : log(2) + fdivrl MO(one) // 1/(|x|+sqrt(1+|x|^2)) : 2*|x| : log(2) + faddp // 2*|x|+1/(|x|+sqrt(1+|x|^2)) : log(2) + fyl2x // log(2*|x|+1/(|x|+sqrt(1+|x|^2))) + jecxz 4f + fchs +4: ret +END(__asinh) +weak_alias (__asinh, asinh) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_asinhf.S b/REORG.TODO/sysdeps/i386/fpu/s_asinhf.S new file mode 100644 index 0000000000..12bcfef934 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_asinhf.S @@ -0,0 +1,139 @@ +/* ix87 specific implementation of arcsinh. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type huge,@object +huge: .double 1e+36 + ASM_SIZE_DIRECTIVE(huge) + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + +#ifdef PIC +#define MO(op) op##@GOTOFF(%edx) +#else +#define MO(op) op +#endif + + .text +ENTRY(__asinhf) + movl 4(%esp), %ecx + movl $0x7fffffff, %eax + andl %ecx, %eax + andl $0x80000000, %ecx + movl %eax, %edx + orl $0x807fffff, %edx + incl %edx + jz 7f // x in ±Inf or NaN + xorl %ecx, 4(%esp) + flds 4(%esp) // |x| + cmpl $0x38000000, %eax + jb 2f // |x| < 2^-14 + fldln2 // log(2) : |x| + cmpl $0x47000000, %eax + fxch // |x| : log(2) + ja 3f // |x| > 2^14 +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + cmpl $0x40000000, %eax + ja 5f // |x| > 2 + + // 2^-14 <= |x| <= 2 => y = sign(x)*log1p(|x|+|x|^2/(1+sqrt(1+|x|^2))) + fld %st // |x| : |x| : log(2) + fmul %st(1) // |x|^2 : |x| : log(2) + fld %st // |x|^2 : |x|^2 : |x| : log(2) + faddl MO(one) // 1+|x|^2 : |x|^2 : |x| : log(2) + fsqrt // sqrt(1+|x|^2) : |x|^2 : |x| : log(2) + faddl MO(one) // 1+sqrt(1+|x|^2) : |x|^2 : |x| : log(2) + fdivrp // |x|^2/(1+sqrt(1+|x|^2)) : |x| : log(2) + faddp // |x|+|x|^2/(1+sqrt(1+|x|^2)) : log(2) + fcoml MO(limit) + fnstsw + sahf + ja 6f + fyl2xp1 + jecxz 4f + fchs +4: ret + +7: flds 4(%esp) + ret + +6: faddl MO(one) + fyl2x + jecxz 4f + fchs +4: ret + + // |x| < 2^-14 => y = x (inexact iff |x| != 0.0) + .align ALIGNARG(4) +2: +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + jecxz 4f + fchs // x +4: fld %st // x : x + faddl MO(huge) // huge+x : x + fstp %st(0) // x + cmpl $0x00800000, %eax + jae 8f + subl $4, %esp + cfi_adjust_cfa_offset (4) + fld %st(0) + fmul %st(0) + fstps (%esp) + addl $4, %esp + cfi_adjust_cfa_offset (-4) +8: ret + + // |x| > 2^14 => y = sign(x) * (log(|x|) + log(2)) + .align ALIGNARG(4) +3: fyl2x // log(|x|) + fldln2 // log(2) : log(|x|) + faddp // log(|x|)+log(2) + jecxz 4f + fchs +4: ret + + // |x| > 2 => y = sign(x) * log(2*|x| + 1/(|x|+sqrt(x*x+1))) + .align ALIGNARG(4) +5: fld %st // |x| : |x| : log(2) + fadd %st, %st(1) // |x| : 2*|x| : log(2) + fld %st // |x| : |x| : 2*|x| : log(2) + fmul %st(1) // |x|^2 : |x| : 2*|x| : log(2) + faddl MO(one) // 1+|x|^2 : |x| : 2*|x| : log(2) + fsqrt // sqrt(1+|x|^2) : |x| : 2*|x| : log(2) + faddp // |x|+sqrt(1+|x|^2) : 2*|x| : log(2) + fdivrl MO(one) // 1/(|x|+sqrt(1+|x|^2)) : 2*|x| : log(2) + faddp // 2*|x|+1/(|x|+sqrt(1+|x|^2)) : log(2) + fyl2x // log(2*|x|+1/(|x|+sqrt(1+|x|^2))) + jecxz 4f + fchs +4: ret +END(__asinhf) +weak_alias (__asinhf, asinhf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_asinhl.S b/REORG.TODO/sysdeps/i386/fpu/s_asinhl.S new file mode 100644 index 0000000000..f31a267e78 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_asinhl.S @@ -0,0 +1,144 @@ +/* ix87 specific implementation of arcsinh. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type huge,@object +huge: .tfloat 1e+4930 + ASM_SIZE_DIRECTIVE(huge) + .align ALIGNARG(4) + /* Please note that we use double value for 1.0. This number + has an exact representation and so we don't get accuracy + problems. The advantage is that the code is simpler. */ + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. */ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + +#ifdef PIC +#define MO(op) op##@GOTOFF(%edx) +#else +#define MO(op) op +#endif + + .text +ENTRY(__asinhl) + movl 12(%esp), %ecx + movl $0x7fff, %eax + andl %ecx, %eax + andl $0x8000, %ecx + movl %eax, %edx + orl $0xffff8000, %edx + incl %edx + jz 7f // x in ±Inf or NaN + xorl %ecx, 12(%esp) + fldt 4(%esp) // |x| + cmpl $0x3fde, %eax + jb 2f // |x| < 2^-34 + fldln2 // log(2) : |x| + cmpl $0x4020, %eax + fxch // |x| : log(2) + ja 3f // |x| > 2^34 +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + cmpl $0x4000, %eax + ja 5f // |x| > 2 + + // 2^-34 <= |x| <= 2 => y = sign(x)*log1p(|x|+|x|^2/(1+sqrt(1+|x|^2))) + fld %st // |x| : |x| : log(2) + fmul %st(1) // |x|^2 : |x| : log(2) + fld %st // |x|^2 : |x|^2 : |x| : log(2) + faddl MO(one) // 1+|x|^2 : |x|^2 : |x| : log(2) + fsqrt // sqrt(1+|x|^2) : |x|^2 : |x| : log(2) + faddl MO(one) // 1+sqrt(1+|x|^2) : |x|^2 : |x| : log(2) + fdivrp // |x|^2/(1+sqrt(1+|x|^2)) : |x| : log(2) + faddp // |x|+|x|^2/(1+sqrt(1+|x|^2)) : log(2) + fcoml MO(limit) + fnstsw + sahf + ja 6f + fyl2xp1 + jecxz 4f + fchs +4: ret + +7: fldt 4(%esp) + fadd %st + ret + +6: faddl MO(one) + fyl2x + jecxz 4f + fchs +4: ret + + // |x| < 2^-34 => y = x (inexact iff |x| != 0.0) + .align ALIGNARG(4) +2: +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + jecxz 4f + fchs // x +4: fld %st // x : x + fldt MO(huge) // huge : x : x + faddp // huge+x : x + fstp %st(0) // x + cmpl $0x0001, %eax + jae 8f + fld %st(0) + fmul %st(0) + fstp %st(0) +8: ret + + // |x| > 2^34 => y = sign(x) * (log(|x|) + log(2)) + .align ALIGNARG(4) +3: fyl2x // log(|x|) + fldln2 // log(2) : log(|x|) + faddp // log(|x|)+log(2) + jecxz 4f + fchs +4: ret + + // |x| > 2 => y = sign(x) * log(2*|x| + 1/(|x|+sqrt(x*x+1))) + .align ALIGNARG(4) +5: fld %st // |x| : |x| : log(2) + fadd %st, %st(1) // |x| : 2*|x| : log(2) + fld %st // |x| : |x| : 2*|x| : log(2) + fmul %st(1) // |x|^2 : |x| : 2*|x| : log(2) + faddl MO(one) // 1+|x|^2 : |x| : 2*|x| : log(2) + fsqrt // sqrt(1+|x|^2) : |x| : 2*|x| : log(2) + faddp // |x|+sqrt(1+|x|^2) : 2*|x| : log(2) + fdivrl MO(one) // 1/(|x|+sqrt(1+|x|^2)) : 2*|x| : log(2) + faddp // 2*|x|+1/(|x|+sqrt(1+|x|^2)) : log(2) + fyl2x // log(2*|x|+1/(|x|+sqrt(1+|x|^2))) + jecxz 4f + fchs +4: ret +END(__asinhl) +weak_alias (__asinhl, asinhl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_atan.S b/REORG.TODO/sysdeps/i386/fpu/s_atan.S new file mode 100644 index 0000000000..644de78feb --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_atan.S @@ -0,0 +1,30 @@ +/* + * Written by J.T. 
Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + +RCSID("$NetBSD: s_atan.S,v 1.4 1995/05/08 23:50:41 jtc Exp $") + +DEFINE_DBL_MIN + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__atan) +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + fldl 4(%esp) + fld1 + fpatan + DBL_CHECK_FORCE_UFLOW + ret +END (__atan) +weak_alias (__atan, atan) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_atanf.S b/REORG.TODO/sysdeps/i386/fpu/s_atanf.S new file mode 100644 index 0000000000..0589c1135e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_atanf.S @@ -0,0 +1,30 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + +RCSID("$NetBSD: s_atanf.S,v 1.3 1995/05/08 23:51:33 jtc Exp $") + +DEFINE_FLT_MIN + +#ifdef PIC +# define MO(op) op##@GOTOFF(%ecx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__atanf) +#ifdef PIC + LOAD_PIC_REG (cx) +#endif + flds 4(%esp) + fld1 + fpatan + FLT_CHECK_FORCE_UFLOW + ret +END (__atanf) +weak_alias (__atanf, atanf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_atanl.c b/REORG.TODO/sysdeps/i386/fpu/s_atanl.c new file mode 100644 index 0000000000..b7dba88aad --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_atanl.c @@ -0,0 +1,22 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + */ + +#include <math_private.h> + +long double +__atanl (long double x) +{ + long double res; + + asm ("fld1\n" + "fpatan" + : "=t" (res) : "0" (x)); + + return res; +} + +weak_alias (__atanl, atanl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_cbrt.S b/REORG.TODO/sysdeps/i386/fpu/s_cbrt.S new file mode 100644 index 0000000000..7f01659eae --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_cbrt.S @@ -0,0 +1,200 @@ +/* Compute cubic root of double value. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Dirk Alboth <dirka@uni-paderborn.de> and + Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
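The cbrt implementation below scales the argument, evaluates a small polynomial, refines the result with one Halley step, and rescales. A C sketch of the scheme (my reconstruction, using the coefficient and factor tables defined just below; frexp stands in for the manual exponent surgery, which also makes the two54 rescue for subnormals unnecessary):

    #include <math.h>

    double
    cbrt_sketch (double x)
    {
      static const double f[7] =
        { 0.354895765043919860, 1.50819193781584896, -2.11499494167371287,
          2.44693122563534430, -1.83469277483613086, 0.784932344976639262,
          -0.145263899385486377 };
      static const double fac[5] =          /* cbrt(2)^r for r = -2..2 */
        { 0.629960524947436582364439673883,
          0.793700525984099737355196796584,
          1.0, 1.2599210498948731648, 1.5874010519681994748 };

      if (x == 0.0 || !isfinite (x))
        return x;                           /* +-0, +-Inf, NaN: argument */

      int e;
      double m = frexp (fabs (x), &e);      /* |x| = m * 2^e, m in [0.5,1) */
      double u = f[6];                      /* polynomial guess for cbrt(m) */
      for (int i = 5; i >= 0; i--)
        u = u * m + f[i];
      double t2 = u * u * u;
      u *= (t2 + 2.0 * m) / (2.0 * t2 + m); /* one Halley iteration */
      int q = e / 3, r = e % 3;             /* truncated, r in -2..2 */
      return copysign (ldexp (u * fac[r + 2], q), x);
    }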
*/ + +#include <machine/asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type f7,@object +f7: .double -0.145263899385486377 + ASM_SIZE_DIRECTIVE(f7) + .type f6,@object +f6: .double 0.784932344976639262 + ASM_SIZE_DIRECTIVE(f6) + .type f5,@object +f5: .double -1.83469277483613086 + ASM_SIZE_DIRECTIVE(f5) + .type f4,@object +f4: .double 2.44693122563534430 + ASM_SIZE_DIRECTIVE(f4) + .type f3,@object +f3: .double -2.11499494167371287 + ASM_SIZE_DIRECTIVE(f3) + .type f2,@object +f2: .double 1.50819193781584896 + ASM_SIZE_DIRECTIVE(f2) + .type f1,@object +f1: .double 0.354895765043919860 + ASM_SIZE_DIRECTIVE(f1) + +#define CBRT2 1.2599210498948731648 +#define ONE_CBRT2 0.793700525984099737355196796584 +#define SQR_CBRT2 1.5874010519681994748 +#define ONE_SQR_CBRT2 0.629960524947436582364439673883 + + .type factor,@object +factor: .double ONE_SQR_CBRT2 + .double ONE_CBRT2 + .double 1.0 + .double CBRT2 + .double SQR_CBRT2 + ASM_SIZE_DIRECTIVE(factor) + + .type two54,@object +two54: .byte 0, 0, 0, 0, 0, 0, 0x50, 0x43 + ASM_SIZE_DIRECTIVE(two54) + +#ifdef PIC +#define MO(op) op##@GOTOFF(%ebx) +#define MOX(op,x) op##@GOTOFF(%ebx,x,1) +#else +#define MO(op) op +#define MOX(op,x) op(x) +#endif + + .text +ENTRY(__cbrt) + movl 4(%esp), %ecx + movl 8(%esp), %eax + movl %eax, %edx + andl $0x7fffffff, %eax + orl %eax, %ecx + jz 1f + xorl %ecx, %ecx + cmpl $0x7ff00000, %eax + jae 1f + +#ifdef PIC + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + LOAD_PIC_REG (bx) +#endif + + cmpl $0x00100000, %eax + jae 2f + +#ifdef PIC + fldl 8(%esp) +#else + fldl 4(%esp) +#endif + fmull MO(two54) + movl $-54, %ecx +#ifdef PIC + fstpl 8(%esp) + movl 12(%esp), %eax +#else + fstpl 4(%esp) + movl 8(%esp), %eax +#endif + movl %eax, %edx + andl $0x7fffffff, %eax + +2: shrl $20, %eax + andl $0x800fffff, %edx + subl $1022, %eax + orl $0x3fe00000, %edx + addl %eax, %ecx +#ifdef PIC + movl %edx, 12(%esp) + + fldl 8(%esp) /* xm */ +#else + movl %edx, 8(%esp) + + fldl 4(%esp) /* xm */ +#endif + fabs + + /* The following code has two tracks: + a) compute the normalized cbrt value + b) compute xe/3 and xe%3 + The right track computes the value for b) and this is done + in an optimized way by avoiding division. + + But why two tracks at all? Very easy: efficiency. Some FP + instruction can overlap with a certain amount of integer (and + FP) instructions. So we get (except for the imull) all + instructions for free. 
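A concrete C sketch of the divide-free e/3 computation referred to above (x86 arithmetic-shift semantics assumed; valid for |e| far below 2^30, which floating-point exponents always are):

    /* 1431655766 = 0x55555556 = ceil(2^32 / 3).  The high 32 bits of the
       64-bit product, corrected by the sign of e, give e/3 truncated
       toward zero; e - 3*(e/3) is then a remainder in -2..2, exactly the
       index range of the five-entry factor table. */
    static int
    div3_sketch (int e)
    {
      int q = (int) (((long long) e * 1431655766LL) >> 32);  /* imull */
      return q - (e >> 31);     /* subtract -1 when e is negative */
    }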
*/ + + fld %st(0) /* xm : xm */ + + fmull MO(f7) /* f7*xm : xm */ + movl $1431655766, %eax + faddl MO(f6) /* f6+f7*xm : xm */ + imull %ecx + fmul %st(1) /* (f6+f7*xm)*xm : xm */ + movl %ecx, %eax + faddl MO(f5) /* f5+(f6+f7*xm)*xm : xm */ + sarl $31, %eax + fmul %st(1) /* (f5+(f6+f7*xm)*xm)*xm : xm */ + subl %eax, %edx + faddl MO(f4) /* f4+(f5+(f6+f7*xm)*xm)*xm : xm */ + fmul %st(1) /* (f4+(f5+(f6+f7*xm)*xm)*xm)*xm : xm */ + faddl MO(f3) /* f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm : xm */ + fmul %st(1) /* (f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm : xm */ + faddl MO(f2) /* f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm : xm */ + fmul %st(1) /* (f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm)*xm : xm */ + faddl MO(f1) /* u:=f1+(f2+(f3+(f4+(f5+(f6+f7*xm)*xm)*xm)*xm)*xm)*xm : xm */ + + fld %st /* u : u : xm */ + fmul %st(1) /* u*u : u : xm */ + fld %st(2) /* xm : u*u : u : xm */ + fadd %st /* 2*xm : u*u : u : xm */ + fxch %st(1) /* u*u : 2*xm : u : xm */ + fmul %st(2) /* t2:=u*u*u : 2*xm : u : xm */ + movl %edx, %eax + fadd %st, %st(1) /* t2 : t2+2*xm : u : xm */ + leal (%edx,%edx,2),%edx + fadd %st(0) /* 2*t2 : t2+2*xm : u : xm */ + subl %edx, %ecx + faddp %st, %st(3) /* t2+2*xm : u : 2*t2+xm */ + shll $3, %ecx + fmulp /* u*(t2+2*xm) : 2*t2+xm */ + fdivp %st, %st(1) /* u*(t2+2*xm)/(2*t2+xm) */ + fmull MOX(16+factor,%ecx) /* u*(t2+2*xm)/(2*t2+xm)*FACT */ + pushl %eax + cfi_adjust_cfa_offset (4) + fildl (%esp) /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */ + fxch /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */ + fscale /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */ + popl %edx + cfi_adjust_cfa_offset (-4) +#ifdef PIC + movl 12(%esp), %eax + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) +#else + movl 8(%esp), %eax +#endif + testl %eax, %eax + fstp %st(1) + jns 4f + fchs +4: ret + + /* Return the argument. */ +1: fldl 4(%esp) + ret +END(__cbrt) +weak_alias (__cbrt, cbrt) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S b/REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S new file mode 100644 index 0000000000..645d24372d --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_cbrtf.S @@ -0,0 +1,177 @@ +/* Compute cubic root of float value. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Dirk Alboth <dirka@uni-paderborn.de> and + Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <machine/asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type f3,@object +f3: .double 0.191502161678719066 + ASM_SIZE_DIRECTIVE(f3) + .type f2,@object +f2: .double 0.697570460207922770 + ASM_SIZE_DIRECTIVE(f2) + .type f1,@object +f1: .double 0.492659620528969547 + ASM_SIZE_DIRECTIVE(f1) + +#define CBRT2 1.2599210498948731648 +#define ONE_CBRT2 0.793700525984099737355196796584 +#define SQR_CBRT2 1.5874010519681994748 +#define ONE_SQR_CBRT2 0.629960524947436582364439673883 + + .type factor,@object + .align ALIGNARG(4) +factor: .double ONE_SQR_CBRT2 + .double ONE_CBRT2 + .double 1.0 + .double CBRT2 + .double SQR_CBRT2 + ASM_SIZE_DIRECTIVE(factor) + + .type two25,@object +two25: .byte 0, 0, 0, 0x4c + ASM_SIZE_DIRECTIVE(two25) + +#ifdef PIC +#define MO(op) op##@GOTOFF(%ebx) +#define MOX(op,x) op##@GOTOFF(%ebx,x,1) +#else +#define MO(op) op +#define MOX(op,x) op(x) +#endif + + .text +ENTRY(__cbrtf) + movl 4(%esp), %eax + xorl %ecx, %ecx + movl %eax, %edx + andl $0x7fffffff, %eax + jz 1f + cmpl $0x7f800000, %eax + jae 1f + +#ifdef PIC + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + LOAD_PIC_REG (bx) +#endif + + cmpl $0x00800000, %eax + jae 2f + +#ifdef PIC + flds 8(%esp) +#else + flds 4(%esp) +#endif + fmuls MO(two25) + movl $-25, %ecx +#ifdef PIC + fstps 8(%esp) + movl 8(%esp), %eax +#else + fstps 4(%esp) + movl 4(%esp), %eax +#endif + movl %eax, %edx + andl $0x7fffffff, %eax + +2: shrl $23, %eax + andl $0x807fffff, %edx + subl $126, %eax + orl $0x3f000000, %edx + addl %eax, %ecx +#ifdef PIC + movl %edx, 8(%esp) + + flds 8(%esp) /* xm */ +#else + movl %edx, 4(%esp) + + flds 4(%esp) /* xm */ +#endif + fabs + + /* The following code has two tracks: + a) compute the normalized cbrt value + b) compute xe/3 and xe%3 + The right track computes the value for b) and this is done + in an optimized way by avoiding division. + + But why two tracks at all? Very easy: efficiency. Some FP + instruction can overlap with a certain amount of integer (and + FP) instructions. So we get (except for the imull) all + instructions for free. */ + + fld %st(0) /* xm : xm */ + fmull MO(f3) /* f3*xm : xm */ + movl $1431655766, %eax + fsubrl MO(f2) /* f2-f3*xm : xm */ + imull %ecx + fmul %st(1) /* (f2-f3*xm)*xm : xm */ + movl %ecx, %eax + faddl MO(f1) /* u:=f1+(f2-f3*xm)*xm : xm */ + sarl $31, %eax + fld %st /* u : u : xm */ + subl %eax, %edx + fmul %st(1) /* u*u : u : xm */ + fld %st(2) /* xm : u*u : u : xm */ + fadd %st /* 2*xm : u*u : u : xm */ + fxch %st(1) /* u*u : 2*xm : u : xm */ + fmul %st(2) /* t2:=u*u*u : 2*xm : u : xm */ + movl %edx, %eax + fadd %st, %st(1) /* t2 : t2+2*xm : u : xm */ + leal (%edx,%edx,2),%edx + fadd %st(0) /* 2*t2 : t2+2*xm : u : xm */ + subl %edx, %ecx + faddp %st, %st(3) /* t2+2*xm : u : 2*t2+xm */ + shll $3, %ecx + fmulp /* u*(t2+2*xm) : 2*t2+xm */ + fdivp %st, %st(1) /* u*(t2+2*xm)/(2*t2+xm) */ + fmull MOX(16+factor,%ecx) /* u*(t2+2*xm)/(2*t2+xm)*FACT */ + pushl %eax + cfi_adjust_cfa_offset (4) + fildl (%esp) /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */ + fxch /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */ + fscale /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */ + popl %edx + cfi_adjust_cfa_offset (-4) +#ifdef PIC + movl 8(%esp), %eax + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) +#else + movl 4(%esp), %eax +#endif + testl %eax, %eax + fstp %st(1) + jns 4f + fchs +4: ret + + /* Return the argument. 
*/ +1: flds 4(%esp) + ret +END(__cbrtf) +weak_alias (__cbrtf, cbrtf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S b/REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S new file mode 100644 index 0000000000..e4a72d29c6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_cbrtl.S @@ -0,0 +1,229 @@ +/* Compute cubic root of long double value. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Dirk Alboth <dirka@uni-paderborn.de> and + Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type f8,@object +f8: .tfloat 0.161617097923756032 + ASM_SIZE_DIRECTIVE(f8) + .align ALIGNARG(4) + .type f7,@object +f7: .tfloat -0.988553671195413709 + ASM_SIZE_DIRECTIVE(f7) + .align ALIGNARG(4) + .type f6,@object +f6: .tfloat 2.65298938441952296 + ASM_SIZE_DIRECTIVE(f6) + .align ALIGNARG(4) + .type f5,@object +f5: .tfloat -4.11151425200350531 + ASM_SIZE_DIRECTIVE(f5) + .align ALIGNARG(4) + .type f4,@object +f4: .tfloat 4.09559907378707839 + ASM_SIZE_DIRECTIVE(f4) + .align ALIGNARG(4) + .type f3,@object +f3: .tfloat -2.82414939754975962 + ASM_SIZE_DIRECTIVE(f3) + .align ALIGNARG(4) + .type f2,@object +f2: .tfloat 1.67595307700780102 + ASM_SIZE_DIRECTIVE(f2) + .align ALIGNARG(4) + .type f1,@object +f1: .tfloat 0.338058687610520237 + ASM_SIZE_DIRECTIVE(f1) + +#define CBRT2 1.2599210498948731648 +#define ONE_CBRT2 0.793700525984099737355196796584 +#define SQR_CBRT2 1.5874010519681994748 +#define ONE_SQR_CBRT2 0.629960524947436582364439673883 + + /* We make the entries in the following table all 16 bytes + wide to avoid having to implement a multiplication by 10. 
*/ + .type factor,@object + .align ALIGNARG(4) +factor: .tfloat ONE_SQR_CBRT2 + .byte 0, 0, 0, 0, 0, 0 + .tfloat ONE_CBRT2 + .byte 0, 0, 0, 0, 0, 0 + .tfloat 1.0 + .byte 0, 0, 0, 0, 0, 0 + .tfloat CBRT2 + .byte 0, 0, 0, 0, 0, 0 + .tfloat SQR_CBRT2 + ASM_SIZE_DIRECTIVE(factor) + + .type two64,@object + .align ALIGNARG(4) +two64: .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43 + ASM_SIZE_DIRECTIVE(two64) + +#ifdef PIC +#define MO(op) op##@GOTOFF(%ebx) +#define MOX(op,x) op##@GOTOFF(%ebx,x,1) +#else +#define MO(op) op +#define MOX(op,x) op(x) +#endif + + .text +ENTRY(__cbrtl) + movl 4(%esp), %ecx + movl 12(%esp), %eax + orl 8(%esp), %ecx + movl %eax, %edx + andl $0x7fff, %eax + orl %eax, %ecx + jz 1f + xorl %ecx, %ecx + cmpl $0x7fff, %eax + je 1f + +#ifdef PIC + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) + LOAD_PIC_REG (bx) +#endif + + cmpl $0, %eax + jne 2f + +#ifdef PIC + fldt 8(%esp) +#else + fldt 4(%esp) +#endif + fmull MO(two64) + movl $-64, %ecx +#ifdef PIC + fstpt 8(%esp) + movl 16(%esp), %eax +#else + fstpt 4(%esp) + movl 12(%esp), %eax +#endif + movl %eax, %edx + andl $0x7fff, %eax + +2: andl $0x8000, %edx + subl $16382, %eax + orl $0x3ffe, %edx + addl %eax, %ecx +#ifdef PIC + movl %edx, 16(%esp) + + fldt 8(%esp) /* xm */ +#else + movl %edx, 12(%esp) + + fldt 4(%esp) /* xm */ +#endif + fabs + + /* The following code has two tracks: + a) compute the normalized cbrt value + b) compute xe/3 and xe%3 + The right track computes the value for b) and this is done + in an optimized way by avoiding division. + + But why two tracks at all? Very easy: efficiency. Some FP + instruction can overlap with a certain amount of integer (and + FP) instructions. So we get (except for the imull) all + instructions for free. */ + + fldt MO(f8) /* f8 : xm */ + fmul %st(1) /* f8*xm : xm */ + + fldt MO(f7) + faddp /* f7+f8*xm : xm */ + fmul %st(1) /* (f7+f8*xm)*xm : xm */ + movl $1431655766, %eax + fldt MO(f6) + faddp /* f6+(f7+f8*xm)*xm : xm */ + imull %ecx + fmul %st(1) /* (f6+(f7+f8*xm)*xm)*xm : xm */ + movl %ecx, %eax + fldt MO(f5) + faddp /* f5+(f6+(f7+f8*xm)*xm)*xm : xm */ + sarl $31, %eax + fmul %st(1) /* (f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */ + subl %eax, %edx + fldt MO(f4) + faddp /* f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm : xm */ + fmul %st(1) /* (f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */ + fldt MO(f3) + faddp /* f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm : xm */ + fmul %st(1) /* (f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */ + fldt MO(f2) + faddp /* f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm : xm */ + fmul %st(1) /* (f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */ + fldt MO(f1) + faddp /* u:=f1+(f2+(f3+(f4+(f5+(f6+(f7+f8*xm)*xm)*xm)*xm)*xm)*xm)*xm : xm */ + + fld %st /* u : u : xm */ + fmul %st(1) /* u*u : u : xm */ + fld %st(2) /* xm : u*u : u : xm */ + fadd %st /* 2*xm : u*u : u : xm */ + fxch %st(1) /* u*u : 2*xm : u : xm */ + fmul %st(2) /* t2:=u*u*u : 2*xm : u : xm */ + movl %edx, %eax + fadd %st, %st(1) /* t2 : t2+2*xm : u : xm */ + leal (%edx,%edx,2),%edx + fadd %st(0) /* 2*t2 : t2+2*xm : u : xm */ + subl %edx, %ecx + faddp %st, %st(3) /* t2+2*xm : u : 2*t2+xm */ + shll $4, %ecx + fmulp /* u*(t2+2*xm) : 2*t2+xm */ + fdivp %st, %st(1) /* u*(t2+2*xm)/(2*t2+xm) */ + fldt MOX(32+factor,%ecx) + fmulp /* u*(t2+2*xm)/(2*t2+xm)*FACT */ + pushl %eax + cfi_adjust_cfa_offset (4) + fildl (%esp) /* xe/3 : u*(t2+2*xm)/(2*t2+xm)*FACT */ + fxch /* u*(t2+2*xm)/(2*t2+xm)*FACT : xe/3 */ + fscale /* u*(t2+2*xm)/(2*t2+xm)*FACT*2^xe/3 */ + popl %edx + cfi_adjust_cfa_offset (-4) 
+#ifdef PIC + movl 16(%esp), %eax + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) +#else + movl 12(%esp), %eax +#endif + testl $0x8000, %eax + fstp %st(1) + jz 4f + fchs +4: ret + + /* Return the argument. */ +1: fldt 4(%esp) + fadd %st + ret +END(__cbrtl) +weak_alias (__cbrtl, cbrtl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_ceil.S b/REORG.TODO/sysdeps/i386/fpu/s_ceil.S new file mode 100644 index 0000000000..1226bb2f87 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_ceil.S @@ -0,0 +1,34 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_ceil.S,v 1.4 1995/05/08 23:52:13 jtc Exp $") + +ENTRY(__ceil) + fldl 4(%esp) + subl $32,%esp + cfi_adjust_cfa_offset (32) + + fnstenv 4(%esp) /* store fpu environment */ + + /* We use here %edx although only the low 1 bits are defined. + But none of the operations should care and they are faster + than the 16 bit operations. */ + movl $0x0800,%edx /* round towards +oo */ + orl 4(%esp),%edx + andl $0xfbff,%edx + movl %edx,(%esp) + fldcw (%esp) /* load modified control word */ + + frndint /* round */ + + fldenv 4(%esp) /* restore original environment */ + + addl $32,%esp + cfi_adjust_cfa_offset (-32) + ret +END (__ceil) +weak_alias (__ceil, ceil) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_ceilf.S b/REORG.TODO/sysdeps/i386/fpu/s_ceilf.S new file mode 100644 index 0000000000..d345c0973b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_ceilf.S @@ -0,0 +1,34 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_ceilf.S,v 1.3 1995/05/08 23:52:44 jtc Exp $") + +ENTRY(__ceilf) + flds 4(%esp) + subl $32,%esp + cfi_adjust_cfa_offset (32) + + fnstenv 4(%esp) /* store fpu environment */ + + /* We use here %edx although only the low 1 bits are defined. + But none of the operations should care and they are faster + than the 16 bit operations. */ + movl $0x0800,%edx /* round towards +oo */ + orl 4(%esp),%edx + andl $0xfbff,%edx + movl %edx,(%esp) + fldcw (%esp) /* load modified control word */ + + frndint /* round */ + + fldenv 4(%esp) /* restore original environment */ + + addl $32,%esp + cfi_adjust_cfa_offset (-32) + ret +END (__ceilf) +weak_alias (__ceilf, ceilf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_ceill.S b/REORG.TODO/sysdeps/i386/fpu/s_ceill.S new file mode 100644 index 0000000000..7c08f43b24 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_ceill.S @@ -0,0 +1,40 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Changes for long double by Ulrich Drepper <drepper@cygnus.com> + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: $") + +ENTRY(__ceill) + fldt 4(%esp) + subl $32,%esp + cfi_adjust_cfa_offset (32) + + fnstenv 4(%esp) /* store fpu environment */ + + /* We use here %edx although only the low 1 bits are defined. + But none of the operations should care and they are faster + than the 16 bit operations. */ + movl $0x0800,%edx /* round towards +oo */ + orl 4(%esp),%edx + andl $0xfbff,%edx + movl %edx,(%esp) + fldcw (%esp) /* load modified control word */ + + frndint /* round */ + + /* Preserve "invalid" exceptions from sNaN input. 
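A portable sketch of this fix-up in <fenv.h> terms (illustrative only): frndint raises FE_INVALID for a signaling NaN, but the fldenv below would discard that flag along with the rest of the new state, so the invalid bit is OR-ed into the saved environment's status-word image first.

    #include <fenv.h>
    #include <math.h>

    long double
    ceill_sketch (long double x)
    {
      fenv_t env;
      fegetenv (&env);                      /* fnstenv */
      fesetround (FE_UPWARD);
      long double r = nearbyintl (x);       /* frndint */
      int inv = fetestexcept (FE_INVALID);  /* fnstsw; andl $0x1 */
      fesetenv (&env);                      /* fldenv: drops new flags */
      if (inv)
        feraiseexcept (FE_INVALID);         /* orl %eax, 8(%esp) */
      return r;
    }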
*/ + fnstsw + andl $0x1, %eax + orl %eax, 8(%esp) + + fldenv 4(%esp) /* restore original environment */ + + addl $32,%esp + cfi_adjust_cfa_offset (-32) + ret +END (__ceill) +weak_alias (__ceill, ceill) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_copysign.S b/REORG.TODO/sysdeps/i386/fpu/s_copysign.S new file mode 100644 index 0000000000..2520a94427 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_copysign.S @@ -0,0 +1,20 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_copysign.S,v 1.4 1995/05/08 23:53:02 jtc Exp $") + +ENTRY(__copysign) + movl 16(%esp),%edx + movl 8(%esp),%eax + andl $0x80000000,%edx + andl $0x7fffffff,%eax + orl %edx,%eax + movl %eax,8(%esp) + fldl 4(%esp) + ret +END (__copysign) +weak_alias (__copysign, copysign) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_copysignf.S b/REORG.TODO/sysdeps/i386/fpu/s_copysignf.S new file mode 100644 index 0000000000..57b1a6f119 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_copysignf.S @@ -0,0 +1,20 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_copysignf.S,v 1.3 1995/05/08 23:53:25 jtc Exp $") + +ENTRY(__copysignf) + movl 8(%esp),%edx + movl 4(%esp),%eax + andl $0x80000000,%edx + andl $0x7fffffff,%eax + orl %edx,%eax + movl %eax,4(%esp) + flds 4(%esp) + ret +END (__copysignf) +weak_alias (__copysignf, copysignf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_copysignl.S b/REORG.TODO/sysdeps/i386/fpu/s_copysignl.S new file mode 100644 index 0000000000..2163e7b014 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_copysignl.S @@ -0,0 +1,21 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Changes for long double by Ulrich Drepper <drepper@cygnus.com> + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: $") + +ENTRY(__copysignl) + movl 24(%esp),%edx + movl 12(%esp),%eax + andl $0x8000,%edx + andl $0x7fff,%eax + orl %edx,%eax + movl %eax,12(%esp) + fldt 4(%esp) + ret +END (__copysignl) +weak_alias (__copysignl, copysignl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_expm1.S b/REORG.TODO/sysdeps/i386/fpu/s_expm1.S new file mode 100644 index 0000000000..59fded2d5a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_expm1.S @@ -0,0 +1,113 @@ +/* ix87 specific implementation of exp(x)-1. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + Based on code by John C. Bowman <bowman@ipp-garching.mpg.de>. + Corrections by H.J. Lu (hjl@gnu.ai.mit.edu), 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
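The expm1 implementations that follow rest on a single identity, sketched here in C (for exposition only: spelling it out this way in C forfeits the precision f2xm1 provides near zero, and the overflow path that tail-calls __exp is omitted):

    #include <math.h>

    /* With t = x*log2(e), n = rint(t), f = t - n:
       e^x - 1 = 2^t - 1 = 2^n * (2^f - 1) + (2^n - 1).
       f2xm1 computes 2^f - 1 without cancellation; fscale applies 2^n. */
    double
    expm1_sketch (double x)
    {
      double t = x * M_LOG2E;
      int n = (int) nearbyint (t);
      return ldexp (exp2 (t - n) - 1.0, n) + (ldexp (1.0, n) - 1.0);
    }

The assembly forces round-to-nearest around its frndint so that |f| <= 1/2, inside the [-1, 1] domain f2xm1 requires.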
*/ + + /* Using: e^x - 1 = 2^(x * log2(e)) - 1 */ + +#include <sysdep.h> +#include <machine/asm.h> +#include <i386-math-asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type minus1,@object +minus1: .double -1.0 + ASM_SIZE_DIRECTIVE(minus1) + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + .type l2e,@object +l2e: .tfloat 1.442695040888963407359924681002 + ASM_SIZE_DIRECTIVE(l2e) + +DEFINE_DBL_MIN + +#ifdef PIC +#define MO(op) op##@GOTOFF(%edx) +#else +#define MO(op) op +#endif + + .text +ENTRY(__expm1) + movzwl 4+6(%esp), %eax + xorb $0x80, %ah // invert sign bit (now 1 is "positive") + cmpl $0xc086, %eax // is num >= 704? + jae HIDDEN_JUMPTARGET (__exp) + + fldl 4(%esp) // x + fxam // Is NaN, +-Inf or +-0? + xorb $0x80, %ah + cmpl $0xc043, %eax // is num <= -38.0? + fstsw %ax + movb $0x45, %ch + jb 4f + + // Below -38.0 (may be -NaN or -Inf). + andb %ah, %ch +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + cmpb $0x01, %ch + je 5f // If -NaN, jump. + jmp 2f // -large, possibly -Inf. + +4: // In range -38.0 to 704.0 (may be +-0 but not NaN or +-Inf). + andb %ah, %ch + cmpb $0x40, %ch + je 3f // If +-0, jump. +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + +5: fldt MO(l2e) // log2(e) : x + fmulp // log2(e)*x + fld %st // log2(e)*x : log2(e)*x + // Set round-to-nearest temporarily. + subl $8, %esp + cfi_adjust_cfa_offset (8) + fstcw 4(%esp) + movl $0xf3ff, %ecx + andl 4(%esp), %ecx + movl %ecx, (%esp) + fldcw (%esp) + frndint // int(log2(e)*x) : log2(e)*x + fldcw 4(%esp) + addl $8, %esp + cfi_adjust_cfa_offset (-8) + fsubr %st, %st(1) // int(log2(e)*x) : fract(log2(e)*x) + fxch // fract(log2(e)*x) : int(log2(e)*x) + f2xm1 // 2^fract(log2(e)*x)-1 : int(log2(e)*x) + fscale // 2^(log2(e)*x)-2^int(log2(e)*x) : int(log2(e)*x) + fxch // int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x) + fldl MO(one) // 1 : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x) + fscale // 2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x) + fsubrl MO(one) // 1-2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x) + fstp %st(1) // 1-2^int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x) + fsubrp %st, %st(1) // 2^(log2(e)*x) + DBL_CHECK_FORCE_UFLOW + ret + +2: fstp %st + fldl MO(minus1) // Set result to -1.0. +3: ret +END(__expm1) +weak_alias (__expm1, expm1) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_expm1f.S b/REORG.TODO/sysdeps/i386/fpu/s_expm1f.S new file mode 100644 index 0000000000..4f0b2e7832 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_expm1f.S @@ -0,0 +1,113 @@ +/* ix87 specific implementation of exp(x)-1. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + Based on code by John C. Bowman <bowman@ipp-garching.mpg.de>. + Corrections by H.J. Lu (hjl@gnu.ai.mit.edu), 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + + /* Using: e^x - 1 = 2^(x * log2(e)) - 1 */ + +#include <sysdep.h> +#include <machine/asm.h> +#include <i386-math-asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type minus1,@object +minus1: .double -1.0 + ASM_SIZE_DIRECTIVE(minus1) + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + .type l2e,@object +l2e: .tfloat 1.442695040888963407359924681002 + ASM_SIZE_DIRECTIVE(l2e) + +DEFINE_FLT_MIN + +#ifdef PIC +#define MO(op) op##@GOTOFF(%edx) +#else +#define MO(op) op +#endif + + .text +ENTRY(__expm1f) + movzwl 4+2(%esp), %eax + xorb $0x80, %ah // invert sign bit (now 1 is "positive") + cmpl $0xc2b1, %eax // is num >= 88.5? + jae HIDDEN_JUMPTARGET (__expf) + + flds 4(%esp) // x + fxam // Is NaN, +-Inf or +-0? + xorb $0x80, %ah + cmpl $0xc190, %eax // is num <= -18.0? + fstsw %ax + movb $0x45, %ch + jb 4f + + // Below -18.0 (may be -NaN or -Inf). + andb %ah, %ch +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + cmpb $0x01, %ch + je 5f // If -NaN, jump. + jmp 2f // -large, possibly -Inf. + +4: // In range -18.0 to 88.5 (may be +-0 but not NaN or +-Inf). + andb %ah, %ch + cmpb $0x40, %ch + je 3f // If +-0, jump. +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + +5: fldt MO(l2e) // log2(e) : x + fmulp // log2(e)*x + fld %st // log2(e)*x : log2(e)*x + // Set round-to-nearest temporarily. + subl $8, %esp + cfi_adjust_cfa_offset (8) + fstcw 4(%esp) + movl $0xf3ff, %ecx + andl 4(%esp), %ecx + movl %ecx, (%esp) + fldcw (%esp) + frndint // int(log2(e)*x) : log2(e)*x + fldcw 4(%esp) + addl $8, %esp + cfi_adjust_cfa_offset (-8) + fsubr %st, %st(1) // int(log2(e)*x) : fract(log2(e)*x) + fxch // fract(log2(e)*x) : int(log2(e)*x) + f2xm1 // 2^fract(log2(e)*x)-1 : int(log2(e)*x) + fscale // 2^(log2(e)*x)-2^int(log2(e)*x) : int(log2(e)*x) + fxch // int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x) + fldl MO(one) // 1 : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x) + fscale // 2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x) + fsubrl MO(one) // 1-2^int(log2(e)*x) : int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x) + fstp %st(1) // 1-2^int(log2(e)*x) : 2^(log2(e)*x)-2^int(log2(e)*x) + fsubrp %st, %st(1) // 2^(log2(e)*x) + FLT_CHECK_FORCE_UFLOW + ret + +2: fstp %st + fldl MO(minus1) // Set result to -1.0. 
+3: ret +END(__expm1f) +weak_alias (__expm1f, expm1f) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_expm1l.S b/REORG.TODO/sysdeps/i386/fpu/s_expm1l.S new file mode 100644 index 0000000000..7fbd99b0db --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_expm1l.S @@ -0,0 +1,2 @@ +#define USE_AS_EXPM1L +#include <e_expl.S> diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fabs.S b/REORG.TODO/sysdeps/i386/fpu/s_fabs.S new file mode 100644 index 0000000000..23ae9dccb9 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_fabs.S @@ -0,0 +1,9 @@ +#include <sysdep.h> + + .text +ENTRY(__fabs) + fldl 4(%esp) + fabs + ret +END(__fabs) +weak_alias (__fabs, fabs) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fabsf.S b/REORG.TODO/sysdeps/i386/fpu/s_fabsf.S new file mode 100644 index 0000000000..c0407a8839 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_fabsf.S @@ -0,0 +1,9 @@ +#include <sysdep.h> + + .text +ENTRY(__fabsf) + flds 4(%esp) + fabs + ret +END(__fabsf) +weak_alias (__fabsf, fabsf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fabsl.S b/REORG.TODO/sysdeps/i386/fpu/s_fabsl.S new file mode 100644 index 0000000000..a12a3e050b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_fabsl.S @@ -0,0 +1,9 @@ +#include <sysdep.h> + + .text +ENTRY(__fabsl) + fldt 4(%esp) + fabs + ret +END(__fabsl) +weak_alias (__fabsl, fabsl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fdim.c b/REORG.TODO/sysdeps/i386/fpu/s_fdim.c new file mode 100644 index 0000000000..6243c62998 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_fdim.c @@ -0,0 +1,50 @@ +/* Return positive difference between arguments. i386 version. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <errno.h> +#include <fpu_control.h> +#include <math.h> +#include <math_private.h> + +double +__fdim (double x, double y) +{ + if (islessequal (x, y)) + return 0.0; + + /* To avoid double rounding, set double precision for the + subtraction. math_narrow_eval is still needed to eliminate + excess range in the case of overflow. If the result of the + subtraction is in the subnormal range for double, it is exact, so + no issues of double rounding for subnormals arise. */ + fpu_control_t cw, cw_double; + _FPU_GETCW (cw); + cw_double = (cw & ~_FPU_EXTENDED) | _FPU_DOUBLE; + _FPU_SETCW (cw_double); + double r = math_narrow_eval (x - y); + _FPU_SETCW (cw); + if (isinf (r) && !isinf (x) && !isinf (y)) + __set_errno (ERANGE); + + return r; +} +weak_alias (__fdim, fdim) +#ifdef NO_LONG_DOUBLE +strong_alias (__fdim, __fdiml) +weak_alias (__fdim, fdiml) +#endif diff --git a/REORG.TODO/sysdeps/i386/fpu/s_finite.S b/REORG.TODO/sysdeps/i386/fpu/s_finite.S new file mode 100644 index 0000000000..1ae4aed451 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_finite.S @@ -0,0 +1,17 @@ +/* + * Written by Joe Keane <jgk@jgk.org>. 
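The __finite family below is a branch-free exponent test; a C sketch of the double version (my reading of it): the subtraction flips the sign bit relative to the high word exactly when the exponent field is below the all-ones pattern 0x7ff, so one xor and shift leaves 1 for finite values and 0 for +-Inf and NaN.

    #include <stdint.h>
    #include <string.h>

    int
    finite_sketch (double x)
    {
      uint64_t u;
      memcpy (&u, &x, sizeof u);
      uint32_t hi = (uint32_t) (u >> 32);   /* sign, exponent, top mantissa */
      return (int) (((0xFFEFFFFFu - hi) ^ hi) >> 31);
    }

0xFFEFFFFF is the high word of the most negative finite double, which is what makes the borrow land exactly on the exponent boundary.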
+ */ + +#include <machine/asm.h> + +ENTRY(__finite) + movl 8(%esp),%eax + movl $0xFFEFFFFF,%ecx + subl %eax,%ecx + xorl %ecx,%eax + shrl $31, %eax + ret +END (__finite) +weak_alias (__finite, finite) +hidden_def (__finite) + diff --git a/REORG.TODO/sysdeps/i386/fpu/s_finitef.S b/REORG.TODO/sysdeps/i386/fpu/s_finitef.S new file mode 100644 index 0000000000..69e72facff --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_finitef.S @@ -0,0 +1,16 @@ +/* + * Written by Joe Keane <jgk@jgk.org>. + */ + +#include <machine/asm.h> + +ENTRY(__finitef) + movl 4(%esp),%eax + movl $0xFF7FFFFF,%ecx + subl %eax,%ecx + xorl %ecx,%eax + shrl $31,%eax + ret +END (__finitef) +weak_alias (__finitef, finitef) +hidden_def (__finitef) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_finitel.S b/REORG.TODO/sysdeps/i386/fpu/s_finitel.S new file mode 100644 index 0000000000..cce90e18fc --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_finitel.S @@ -0,0 +1,15 @@ +/* + * Written by Joe Keane <jgk@jgk.org>. + */ + +#include <machine/asm.h> + +ENTRY(__finitel) + movl 12(%esp),%eax + orl $0xffff8000, %eax + incl %eax + shrl $31, %eax + ret +END (__finitel) +weak_alias (__finitel, finitel) +hidden_def (__finitel) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_floor.S b/REORG.TODO/sysdeps/i386/fpu/s_floor.S new file mode 100644 index 0000000000..ed837dae40 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_floor.S @@ -0,0 +1,34 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_floor.S,v 1.4 1995/05/09 00:01:59 jtc Exp $") + +ENTRY(__floor) + fldl 4(%esp) + subl $32,%esp + cfi_adjust_cfa_offset (32) + + fnstenv 4(%esp) /* store fpu environment */ + + /* We use here %edx although only the low 1 bits are defined. + But none of the operations should care and they are faster + than the 16 bit operations. */ + movl $0x400,%edx /* round towards -oo */ + orl 4(%esp),%edx + andl $0xf7ff,%edx + movl %edx,(%esp) + fldcw (%esp) /* load modified control word */ + + frndint /* round */ + + fldenv 4(%esp) /* restore original environment */ + + addl $32,%esp + cfi_adjust_cfa_offset (-32) + ret +END (__floor) +weak_alias (__floor, floor) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_floorf.S b/REORG.TODO/sysdeps/i386/fpu/s_floorf.S new file mode 100644 index 0000000000..84b6f7ed99 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_floorf.S @@ -0,0 +1,34 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_floorf.S,v 1.3 1995/05/09 00:04:32 jtc Exp $") + +ENTRY(__floorf) + flds 4(%esp) + subl $32,%esp + cfi_adjust_cfa_offset (32) + + fnstenv 4(%esp) /* store fpu environment */ + + /* We use here %edx although only the low 1 bits are defined. + But none of the operations should care and they are faster + than the 16 bit operations. */ + movl $0x400,%edx /* round towards -oo */ + orl 4(%esp),%edx + andl $0xf7ff,%edx + movl %edx,(%esp) + fldcw (%esp) /* load modified control word */ + + frndint /* round */ + + fldenv 4(%esp) /* restore original environment */ + + addl $32,%esp + cfi_adjust_cfa_offset (-32) + ret +END (__floorf) +weak_alias (__floorf, floorf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_floorl.S b/REORG.TODO/sysdeps/i386/fpu/s_floorl.S new file mode 100644 index 0000000000..dc74a0c446 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_floorl.S @@ -0,0 +1,40 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Changes for long double by Ulrich Drepper <drepper@cygnus.com> + * Public domain. 
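All of the floor (and ceil) variants share one pattern: save the FPU environment with fnstenv, force the rounding-control bits in the control-word image, frndint, then reload the saved environment. A portable sketch of the idea:

    #include <fenv.h>
    #include <math.h>

    double
    floor_sketch (double x)
    {
      int save = fegetround ();
      fesetround (FE_DOWNWARD);   /* asm: orl $0x400 / andl $0xf7ff on cw */
      double r = nearbyint (x);   /* frndint */
      fesetround (save);
      return r;
    }

Restoring with fldenv rather than a bare fldcw also throws away the inexact flag frndint may have raised, matching nearbyint's no-inexact semantics; only the long-double variants then re-inject the invalid flag for signaling NaNs, as sketched above for ceill.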
+ */ + +#include <machine/asm.h> + +RCSID("$NetBSD: $") + +ENTRY(__floorl) + fldt 4(%esp) + subl $32,%esp + cfi_adjust_cfa_offset (32) + + fnstenv 4(%esp) /* store fpu environment */ + + /* We use here %edx although only the low 1 bits are defined. + But none of the operations should care and they are faster + than the 16 bit operations. */ + movl $0x400,%edx /* round towards -oo */ + orl 4(%esp),%edx + andl $0xf7ff,%edx + movl %edx,(%esp) + fldcw (%esp) /* load modified control word */ + + frndint /* round */ + + /* Preserve "invalid" exceptions from sNaN input. */ + fnstsw + andl $0x1, %eax + orl %eax, 8(%esp) + + fldenv 4(%esp) /* restore original environment */ + + addl $32,%esp + cfi_adjust_cfa_offset (-32) + ret +END (__floorl) +weak_alias (__floorl, floorl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmax.S b/REORG.TODO/sysdeps/i386/fpu/s_fmax.S new file mode 100644 index 0000000000..218dcef421 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_fmax.S @@ -0,0 +1,43 @@ +/* Compute maximum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fmax) + fldl 12(%esp) // y + fxam + fnstsw + fldl 4(%esp) // y : x + + andb $0x45, %ah + cmpb $0x01, %ah + je 1f // y == NaN + + fucom %st(1) + fnstsw + sahf + jnc 1f + + fxch %st(1) +1: fstp %st(1) + + ret +END(__fmax) +weak_alias (__fmax, fmax) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S b/REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S new file mode 100644 index 0000000000..b7a00cefeb --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_fmaxf.S @@ -0,0 +1,43 @@ +/* Compute maximum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
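The fmax/fmin family implements the C99 rule that a quiet NaN argument is treated as a missing argument. In C terms, roughly (a sketch; the double and float variants do no more than this):

    #include <math.h>

    double
    fmax_sketch (double x, double y)
    {
      if (isnan (y))
        return x;           /* asm: fxam on y, then je 1f */
      if (isnan (x))
        return y;           /* falls out of the unordered fucom */
      return x < y ? y : x;
    }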
*/ + +#include <sysdep.h> + + .text +ENTRY(__fmaxf) + flds 8(%esp) // y + fxam + fnstsw + flds 4(%esp) // y : x + + andb $0x45, %ah + cmpb $0x01, %ah + je 1f // y == NaN + + fucom %st(1) + fnstsw + sahf + jnc 1f + + fxch %st(1) +1: fstp %st(1) + + ret +END(__fmaxf) +weak_alias (__fmaxf, fmaxf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S b/REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S new file mode 100644 index 0000000000..68162921db --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_fmaxl.S @@ -0,0 +1,71 @@ +/* Compute maximum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fmaxl) + fldt 16(%esp) // y + fxam + fnstsw + fldt 4(%esp) // y : x + + andb $0x45, %ah + cmpb $0x01, %ah + je 2f // y == NaN + + fxam + fnstsw + andb $0x45, %ah + cmpb $0x01, %ah + je 3f // x == NaN + + fucom %st(1) + fnstsw + sahf + jnc 1f + + fxch %st(1) +1: fstp %st(1) + + ret + +2: // st(1) is a NaN; st(0) may or may not be. + fxam + fnstsw + andb $0x45, %ah + cmpb $0x01, %ah + je 4f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 23(%esp) + jz 4f + fstp %st(1) + ret + +3: // st(0) is a NaN; st(1) is not. Test if st(0) is signaling. + testb $0x40, 11(%esp) + jz 4f + fstp %st(0) + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + faddp + ret +END(__fmaxl) +weak_alias (__fmaxl, fmaxl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fmin.S b/REORG.TODO/sysdeps/i386/fpu/s_fmin.S new file mode 100644 index 0000000000..a5bb0e06dd --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_fmin.S @@ -0,0 +1,43 @@ +/* Compute minimum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + + .text +ENTRY(__fmin) + fldl 4(%esp) // x + fldl 12(%esp) // x : y + + fxam + fnstsw + andb $0x45, %ah + cmpb $0x01, %ah + je 1f // y == NaN + + fucom %st(1) + fnstsw + sahf + jc 2f + +1: fxch %st(1) +2: fstp %st(1) + + ret +END(__fmin) +weak_alias (__fmin, fmin) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fminf.S b/REORG.TODO/sysdeps/i386/fpu/s_fminf.S new file mode 100644 index 0000000000..fba4a41120 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_fminf.S @@ -0,0 +1,43 @@ +/* Compute minimum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fminf) + flds 4(%esp) // x + flds 8(%esp) // x : y + + fxam + fnstsw + andb $0x45, %ah + cmpb $0x01, %ah + je 1f // y == NaN + + fucom %st(1) + fnstsw + sahf + jc 2f + +1: fxch %st(1) +2: fstp %st(1) + + ret +END(__fminf) +weak_alias (__fminf, fminf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fminl.S b/REORG.TODO/sysdeps/i386/fpu/s_fminl.S new file mode 100644 index 0000000000..12ef21fda9 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_fminl.S @@ -0,0 +1,71 @@ +/* Compute minimum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fminl) + fldt 16(%esp) // y + fxam + fnstsw + fldt 4(%esp) // y : x + + andb $0x45, %ah + cmpb $0x01, %ah + je 2f // y == NaN + + fxam + fnstsw + andb $0x45, %ah + cmpb $0x01, %ah + je 3f // x == NaN + + fucom %st(1) + fnstsw + sahf + jc 1f + + fxch %st(1) +1: fstp %st(1) + + ret + +2: // st(1) is a NaN; st(0) may or may not be. + fxam + fnstsw + andb $0x45, %ah + cmpb $0x01, %ah + je 4f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 23(%esp) + jz 4f + fstp %st(1) + ret + +3: // st(0) is a NaN; st(1) is not. Test if st(0) is signaling. 
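+ // Bit 0x40 of the top fraction byte (offset 7 into the 10-byte
+ // value, i.e. 11(%esp) for x and 23(%esp) for y) is the quiet
+ // bit, mantissa bit 62; it is clear for a signaling NaN.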
+ testb $0x40, 11(%esp) + jz 4f + fstp %st(0) + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + faddp + ret +END(__fminl) +weak_alias (__fminl, fminl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c b/REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c new file mode 100644 index 0000000000..ce19fd0035 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_fpclassifyl.c @@ -0,0 +1,42 @@ +/* Return classification value corresponding to argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> + +#include <math_private.h> + + +int +__fpclassifyl (long double x) +{ + u_int32_t ex, hx, lx; + int retval = FP_NORMAL; + + GET_LDOUBLE_WORDS (ex, hx, lx, x); + ex &= 0x7fff; + if ((ex | lx | hx) == 0) + retval = FP_ZERO; + else if (ex == 0 && (hx & 0x80000000) == 0) + retval = FP_SUBNORMAL; + else if (ex == 0x7fff) + retval = ((hx & 0x7fffffff) | lx) != 0 ? FP_NAN : FP_INFINITE; + + return retval; +} +libm_hidden_def (__fpclassifyl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_frexp.S b/REORG.TODO/sysdeps/i386/fpu/s_frexp.S new file mode 100644 index 0000000000..104f733bf6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_frexp.S @@ -0,0 +1,83 @@ +/* ix87 specific frexp implementation for double. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <machine/asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type two54,@object +two54: .byte 0, 0, 0, 0, 0, 0, 0x50, 0x43 + ASM_SIZE_DIRECTIVE(two54) + +#ifdef PIC +#define MO(op) op##@GOTOFF(%edx) +#else +#define MO(op) op +#endif + +#define PARMS 4 /* no space for saved regs */ +#define VAL0 PARMS +#define VAL1 VAL0+4 +#define EXPP VAL1+4 + + .text +ENTRY (__frexp) + + movl VAL0(%esp), %ecx + movl VAL1(%esp), %eax + movl %eax, %edx + andl $0x7fffffff, %eax + orl %eax, %ecx + jz 1f + xorl %ecx, %ecx + cmpl $0x7ff00000, %eax + jae 1f + + cmpl $0x00100000, %eax + jae 2f + + fldl VAL0(%esp) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fmull MO(two54) + movl $-54, %ecx + fstpl VAL0(%esp) + fwait + movl VAL1(%esp), %eax + movl %eax, %edx + andl $0x7fffffff, %eax + +2: shrl $20, %eax + andl $0x800fffff, %edx + subl $1022, %eax + orl $0x3fe00000, %edx + addl %eax, %ecx + movl %edx, VAL1(%esp) + + /* Store %ecx in the variable pointed to by the second argument, + get the factor from the stack and return. */ +1: movl EXPP(%esp), %eax + fldl VAL0(%esp) + movl %ecx, (%eax) + + ret +END (__frexp) +weak_alias (__frexp, frexp) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_frexpf.S b/REORG.TODO/sysdeps/i386/fpu/s_frexpf.S new file mode 100644 index 0000000000..f21c39ec4b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_frexpf.S @@ -0,0 +1,80 @@ +/* ix87 specific frexp implementation for float. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> + + .section .rodata + + .align ALIGNARG(4) + .type two25,@object +two25: .byte 0, 0, 0, 0x4c + ASM_SIZE_DIRECTIVE(two25) + +#ifdef PIC +#define MO(op) op##@GOTOFF(%edx) +#else +#define MO(op) op +#endif + +#define PARMS 4 /* no space for saved regs */ +#define VAL PARMS +#define EXPP VAL+4 + + .text +ENTRY (__frexpf) + + movl VAL(%esp), %eax + xorl %ecx, %ecx + movl %eax, %edx + andl $0x7fffffff, %eax + jz 1f + cmpl $0x7f800000, %eax + jae 1f + + cmpl $0x00800000, %eax + jae 2f + + flds VAL(%esp) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fmuls MO(two25) + movl $-25, %ecx + fstps VAL(%esp) + fwait + movl VAL(%esp), %eax + movl %eax, %edx + andl $0x7fffffff, %eax + +2: shrl $23, %eax + andl $0x807fffff, %edx + subl $126, %eax + orl $0x3f000000, %edx + addl %eax, %ecx + movl %edx, VAL(%esp) + + /* Store %ecx in the variable pointed to by the second argument, + get the factor from the stack and return. 
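+ For a zero, infinite or NaN argument %ecx is still zero at this
+ point, so an exponent of 0 is stored.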
*/
+1: movl EXPP(%esp), %eax
+ flds VAL(%esp)
+ movl %ecx, (%eax)
+
+ ret
+END (__frexpf)
+weak_alias (__frexpf, frexpf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_frexpl.S b/REORG.TODO/sysdeps/i386/fpu/s_frexpl.S
new file mode 100644
index 0000000000..04f28888d2
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_frexpl.S
@@ -0,0 +1,92 @@
+/* ix87 specific frexp implementation for long double.
+ Copyright (C) 1997-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <machine/asm.h>
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ .type two64,@object
+two64: .byte 0, 0, 0, 0, 0, 0, 0xf0, 0x43
+ ASM_SIZE_DIRECTIVE(two64)
+
+#ifdef PIC
+#define MO(op) op##@GOTOFF(%edx)
+#else
+#define MO(op) op
+#endif
+
+#define PARMS 4 /* no space for saved regs */
+#define VAL0 PARMS
+#define VAL1 VAL0+4
+#define VAL2 VAL1+4
+#define EXPP VAL2+4
+
+ .text
+ENTRY (__frexpl)
+
+ movl VAL0(%esp), %ecx
+ movl VAL2(%esp), %eax
+ orl VAL1(%esp), %ecx
+ movl %eax, %edx
+ andl $0x7fff, %eax
+ orl %eax, %ecx
+ jz 1f
+ xorl %ecx, %ecx
+ cmpl $0x7fff, %eax
+ je 3f
+
+ cmpl $0, %eax
+ jne 2f
+
+ fldt VAL0(%esp)
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+
+ fmull MO(two64) /* It is not necessary to use an 80-bit factor */
+ movl $-64, %ecx
+ fstpt VAL0(%esp)
+ fwait
+ movl VAL2(%esp), %eax
+ movl %eax, %edx
+ andl $0x7fff, %eax
+
+2: andl $0x8000, %edx
+ subl $16382, %eax
+ orl $0x3ffe, %edx
+ addl %eax, %ecx
+ movl %edx, VAL2(%esp)
+
+ /* Store %ecx in the variable pointed to by the second argument,
+ get the factor from the stack and return. */
+1: movl EXPP(%esp), %eax
+ fldt VAL0(%esp)
+ movl %ecx, (%eax)
+
+ ret
+
+ /* Infinity or NaN; ensure signaling NaNs are quieted. */
+3: movl EXPP(%esp), %eax
+ fldt VAL0(%esp)
+ fadd %st
+ movl %ecx, (%eax)
+ ret
+END (__frexpl)
+weak_alias (__frexpl, frexpl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_isinfl.c b/REORG.TODO/sysdeps/i386/fpu/s_isinfl.c
new file mode 100644
index 0000000000..cdd77183fa
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_isinfl.c
@@ -0,0 +1,32 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Change for long double by Ulrich Drepper <drepper@cygnus.com>.
+ * Intel i387 specific version.
+ * Public domain.
+ */
+
+#if defined(LIBM_SCCS) && !defined(lint)
+static char rcsid[] = "$NetBSD: $";
+#endif
+
+/*
+ * isinfl(x) returns 1 if x is inf, -1 if x is -inf, else 0;
+ * no branching!
+ */
+
+#include <math.h>
+#include <math_private.h>
+
+int __isinfl(long double x)
+{
+ int32_t se,hx,lx;
+ GET_LDOUBLE_WORDS(se,hx,lx,x);
+ /* This additional ^ 0x80000000 is necessary because in Intel's
+ internal representation the normally implicit one is explicitly
+ present.
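+ Only for +-Inf do all three terms below vanish; lx then stays 0,
+ so lx | -lx has the sign bit clear and the final expression
+ selects +1 or -1 from the sign word, while any other argument
+ leaves a bit set in lx and yields 0.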
*/ + lx |= (hx ^ 0x80000000) | ((se & 0x7fff) ^ 0x7fff); + lx |= -lx; + se &= 0x8000; + return ~(lx >> 31) & (1 - (se >> 14)); +} +hidden_def (__isinfl) +weak_alias (__isinfl, isinfl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_isnanl.c b/REORG.TODO/sysdeps/i386/fpu/s_isnanl.c new file mode 100644 index 0000000000..816396d8fb --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_isnanl.c @@ -0,0 +1,43 @@ +/* s_isnanl.c -- long double version for i387 of s_isnan.c. + * Conversion to long double by Ulrich Drepper, + * Cygnus Support, drepper@cygnus.com. + */ + +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + +#if defined(LIBM_SCCS) && !defined(lint) +static char rcsid[] = "$NetBSD: $"; +#endif + +/* + * isnanl(x) returns 1 is x is nan, else 0; + * no branching! + */ + +#include <math.h> +#include <math_private.h> + +int __isnanl(long double x) +{ + int32_t se,hx,lx; + GET_LDOUBLE_WORDS(se,hx,lx,x); + se = (se & 0x7fff) << 1; + /* The additional & 0x7fffffff is required because Intel's + extended format has the normally implicit 1 explicit + present. Sigh! */ + lx |= hx & 0x7fffffff; + se |= (u_int32_t)(lx|(-lx))>>31; + se = 0xfffe - se; + return (int)((u_int32_t)(se))>>16; +} +hidden_def (__isnanl) +weak_alias (__isnanl, isnanl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_llrint.S b/REORG.TODO/sysdeps/i386/fpu/s_llrint.S new file mode 100644 index 0000000000..a597183aab --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_llrint.S @@ -0,0 +1,36 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__llrint) + fldl 4(%esp) + subl $8, %esp + cfi_adjust_cfa_offset (8) + fistpll (%esp) + fwait + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + ret +END(__llrint) +weak_alias (__llrint, llrint) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_llrintf.S b/REORG.TODO/sysdeps/i386/fpu/s_llrintf.S new file mode 100644 index 0000000000..a4b574eccb --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_llrintf.S @@ -0,0 +1,36 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__llrintf) + flds 4(%esp) + subl $8, %esp + cfi_adjust_cfa_offset (8) + fistpll (%esp) + fwait + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + ret +END(__llrintf) +weak_alias (__llrintf, llrintf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_llrintl.S b/REORG.TODO/sysdeps/i386/fpu/s_llrintl.S new file mode 100644 index 0000000000..7b48c02ef4 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_llrintl.S @@ -0,0 +1,36 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__llrintl) + fldt 4(%esp) + subl $8, %esp + cfi_adjust_cfa_offset (8) + fistpll (%esp) + fwait + popl %eax + cfi_adjust_cfa_offset (-4) + popl %edx + cfi_adjust_cfa_offset (-4) + ret +END(__llrintl) +weak_alias (__llrintl, llrintl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_log1p.S b/REORG.TODO/sysdeps/i386/fpu/s_log1p.S new file mode 100644 index 0000000000..7978e76095 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_log1p.S @@ -0,0 +1,67 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + +RCSID("$NetBSD: s_log1p.S,v 1.7 1995/05/09 00:10:58 jtc Exp $") + + .section .rodata + + .align ALIGNARG(4) + /* The fyl2xp1 can only be used for values in + -1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2 + 0.29 is a safe value. + */ +limit: .double 0.29 +one: .double 1.0 + +DEFINE_DBL_MIN + +#ifdef PIC +# define MO(op) op##@GOTOFF(%edx) +#else +# define MO(op) op +#endif + +/* + * Use the fyl2xp1 function when the argument is in the range -0.29 to 0.29, + * otherwise fyl2x with the needed extra computation. 
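+ * With ln(2) preloaded by fldln2, fyl2xp1 computes
+ * ln(2) * log2(1 + x) = log1p(x) directly; the fyl2x path forms
+ * 1 + x explicitly first.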
+ */
+ .text
+ENTRY(__log1p)
+ fldln2
+
+ fldl 4(%esp)
+
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+
+ fxam
+ fnstsw
+ fld %st
+ sahf
+ jc 3f // in case x is NaN or ±Inf
+4: fabs
+ fcompl MO(limit)
+ fnstsw
+ sahf
+ jc 2f
+
+ faddl MO(one)
+ fyl2x
+ ret
+
+2: fyl2xp1
+ DBL_CHECK_FORCE_UFLOW_NONNAN
+ ret
+
+3: jp 4b // in case x is ±Inf
+ fstp %st(1)
+ fstp %st(1)
+ ret
+
+END (__log1p)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_log1pf.S b/REORG.TODO/sysdeps/i386/fpu/s_log1pf.S
new file mode 100644
index 0000000000..acaa299d94
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_log1pf.S
@@ -0,0 +1,67 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+#include <i386-math-asm.h>
+
+RCSID("$NetBSD: s_log1pf.S,v 1.4 1995/05/09 00:13:05 jtc Exp $")
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ /* The fyl2xp1 can only be used for values in
+ -1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2
+ 0.29 is a safe value.
+ */
+limit: .float 0.29
+one: .float 1.0
+
+DEFINE_FLT_MIN
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+/*
+ * Use the fyl2xp1 function when the argument is in the range -0.29 to 0.29,
+ * otherwise fyl2x with the needed extra computation.
+ */
+ .text
+ENTRY(__log1pf)
+ fldln2
+
+ flds 4(%esp)
+
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+
+ fxam
+ fnstsw
+ fld %st
+ sahf
+ jc 3f // in case x is NaN or ±Inf
+4: fabs
+ fcomps MO(limit)
+ fnstsw
+ sahf
+ jc 2f
+
+ fadds MO(one)
+ fyl2x
+ ret
+
+2: fyl2xp1
+ FLT_CHECK_FORCE_UFLOW_NONNAN
+ ret
+
+3: jp 4b // in case x is ±Inf
+ fstp %st(1)
+ fstp %st(1)
+ ret
+
+END (__log1pf)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_log1pl.S b/REORG.TODO/sysdeps/i386/fpu/s_log1pl.S
new file mode 100644
index 0000000000..0fd05cbdb3
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_log1pl.S
@@ -0,0 +1,76 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
+ *
+ * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>.
+ */
+
+#include <machine/asm.h>
+
+RCSID("$NetBSD: s_log1p.S,v 1.7 1995/05/09 00:10:58 jtc Exp $")
+
+ .section .rodata
+
+ .align ALIGNARG(4)
+ /* The fyl2xp1 can only be used for values in
+ -1 + sqrt(2) / 2 <= x <= 1 - sqrt(2) / 2
+ 0.29 is a safe value.
+ */
+limit: .tfloat 0.29
+ /* Please note: we use a double value here. Since 1.0 has
+ an exact representation this does not affect the accuracy
+ but it helps to optimize the code. */
+one: .double 1.0
+
+#ifdef PIC
+# define MO(op) op##@GOTOFF(%edx)
+#else
+# define MO(op) op
+#endif
+
+/*
+ * Use the fyl2xp1 function when the argument is in the range -0.29 to 0.29,
+ * otherwise fyl2x with the needed extra computation.
+ */
+ .text
+ENTRY(__log1pl)
+ fldln2
+
+ fldt 4(%esp)
+
+#ifdef PIC
+ LOAD_PIC_REG (dx)
+#endif
+
+ fxam
+ fnstsw
+ fld %st
+ sahf
+ jc 3f // in case x is NaN or ±Inf
+4:
+ fabs
+ fldt MO(limit)
+ fcompp
+ fnstsw
+ sahf
+ jnc 2f
+
+ movzwl 4+8(%esp), %eax
+ xorb $0x80, %ah
+ cmpl $0xc040, %eax
+ jae 5f
+
+ faddl MO(one)
+5: fyl2x
+ ret
+
+2: fyl2xp1
+ ret
+
+3: jp 4b // in case x is ±Inf
+ fstp %st(1)
+ fstp %st(1)
+ fadd %st(0)
+ ret
+
+END (__log1pl)
diff --git a/REORG.TODO/sysdeps/i386/fpu/s_logb.S b/REORG.TODO/sysdeps/i386/fpu/s_logb.S
new file mode 100644
index 0000000000..f78c091c8a
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/fpu/s_logb.S
@@ -0,0 +1,16 @@
+/*
+ * Written by J.T. Conklin <jtc@netbsd.org>.
+ * Public domain.
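+ *
+ * fxtract splits st(0) into significand (new st(0)) and unbiased
+ * exponent (st(1)); popping the significand leaves logb(x).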
+ */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_logb.S,v 1.4 1995/05/09 00:14:30 jtc Exp $") + +ENTRY(__logb) + fldl 4(%esp) + fxtract + fstp %st + ret +END (__logb) +weak_alias (__logb, logb) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_logbf.S b/REORG.TODO/sysdeps/i386/fpu/s_logbf.S new file mode 100644 index 0000000000..91eb3d2925 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_logbf.S @@ -0,0 +1,16 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_logbf.S,v 1.3 1995/05/09 00:15:12 jtc Exp $") + +ENTRY(__logbf) + flds 4(%esp) + fxtract + fstp %st + ret +END (__logbf) +weak_alias (__logbf, logbf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_logbl.c b/REORG.TODO/sysdeps/i386/fpu/s_logbl.c new file mode 100644 index 0000000000..391e2db489 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_logbl.c @@ -0,0 +1,19 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Changes for long double by Ulrich Drepper <drepper@cygnus.com> + * Public domain. + */ + +#include <math_private.h> + +long double +__logbl (long double x) +{ + long double res; + + asm ("fxtract\n" + "fstp %%st" : "=t" (res) : "0" (x)); + return res; +} + +weak_alias (__logbl, logbl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_lrint.S b/REORG.TODO/sysdeps/i386/fpu/s_lrint.S new file mode 100644 index 0000000000..79a374b399 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_lrint.S @@ -0,0 +1,34 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__lrint) + fldl 4(%esp) + subl $4, %esp + cfi_adjust_cfa_offset (4) + fistpl (%esp) + fwait + popl %eax + cfi_adjust_cfa_offset (-4) + ret +END(__lrint) +weak_alias (__lrint, lrint) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_lrintf.S b/REORG.TODO/sysdeps/i386/fpu/s_lrintf.S new file mode 100644 index 0000000000..fc6e68e073 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_lrintf.S @@ -0,0 +1,34 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__lrintf) + flds 4(%esp) + subl $4, %esp + cfi_adjust_cfa_offset (4) + fistpl (%esp) + fwait + popl %eax + cfi_adjust_cfa_offset (-4) + ret +END(__lrintf) +weak_alias (__lrintf, lrintf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_lrintl.S b/REORG.TODO/sysdeps/i386/fpu/s_lrintl.S new file mode 100644 index 0000000000..ba6dbdf44c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_lrintl.S @@ -0,0 +1,34 @@ +/* Round argument to nearest integral value according to current rounding + direction. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__lrintl) + fldt 4(%esp) + subl $4, %esp + cfi_adjust_cfa_offset (4) + fistpl (%esp) + fwait + popl %eax + cfi_adjust_cfa_offset (-4) + ret +END(__lrintl) +weak_alias (__lrintl, lrintl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S b/REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S new file mode 100644 index 0000000000..f7b79b6ff2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_nearbyint.S @@ -0,0 +1,20 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ +/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>. */ + +#include <machine/asm.h> + +ENTRY(__nearbyint) + fldl 4(%esp) + subl $32, %esp + cfi_adjust_cfa_offset (32) + fnstenv 4(%esp) + frndint + fldenv 4(%esp) + addl $32, %esp + cfi_adjust_cfa_offset (-32) + ret +END (__nearbyint) +weak_alias (__nearbyint, nearbyint) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S new file mode 100644 index 0000000000..92df2f87b3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintf.S @@ -0,0 +1,20 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ +/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>. */ + +#include <machine/asm.h> + +ENTRY(__nearbyintf) + flds 4(%esp) + subl $32, %esp + cfi_adjust_cfa_offset (32) + fnstenv 4(%esp) + frndint + fldenv 4(%esp) + addl $32, %esp + cfi_adjust_cfa_offset (-32) + ret +END (__nearbyintf) +weak_alias (__nearbyintf, nearbyintf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S new file mode 100644 index 0000000000..3b7d1e2436 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_nearbyintl.S @@ -0,0 +1,23 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ +/* Adapted for use as nearbyint by Ulrich Drepper <drepper@cygnus.com>. 
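+ Unlike rint, nearbyint must not raise the "inexact" exception, so
+ the environment saved by fnstenv is reloaded after frndint; only
+ the "invalid" flag that a signaling NaN may have set is merged
+ back.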
*/ + +#include <machine/asm.h> + +ENTRY(__nearbyintl) + fldt 4(%esp) + subl $32, %esp + cfi_adjust_cfa_offset (32) + fnstenv 4(%esp) + frndint + fnstsw + andl $0x1, %eax + orl %eax, 8(%esp) + fldenv 4(%esp) + addl $32, %esp + cfi_adjust_cfa_offset (-32) + ret +END (__nearbyintl) +weak_alias (__nearbyintl, nearbyintl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c b/REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c new file mode 100644 index 0000000000..600ad7a8d3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_nextafterl.c @@ -0,0 +1,125 @@ +/* s_nextafterl.c -- long double version of s_nextafter.c. + * Special version for i387. + * Conversion to long double by Ulrich Drepper, + * Cygnus Support, drepper@cygnus.com. + */ + +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + +#if defined(LIBM_SCCS) && !defined(lint) +static char rcsid[] = "$NetBSD: $"; +#endif + +/* IEEE functions + * nextafterl(x,y) + * return the next machine floating-point number of x in the + * direction toward y. + * Special cases: + */ + +#include <errno.h> +#include <math.h> +#include <math_private.h> + +long double __nextafterl(long double x, long double y) +{ + u_int32_t hx,hy,ix,iy; + u_int32_t lx,ly; + int32_t esx,esy; + + GET_LDOUBLE_WORDS(esx,hx,lx,x); + GET_LDOUBLE_WORDS(esy,hy,ly,y); + ix = esx&0x7fff; /* |x| */ + iy = esy&0x7fff; /* |y| */ + + /* Intel's extended format has the normally implicit 1 explicit + present. Sigh! */ + if(((ix==0x7fff)&&(((hx&0x7fffffff)|lx)!=0)) || /* x is nan */ + ((iy==0x7fff)&&(((hy&0x7fffffff)|ly)!=0))) /* y is nan */ + return x+y; + if(x==y) return y; /* x=y, return y */ + if((ix|hx|lx)==0) { /* x == 0 */ + long double u; + SET_LDOUBLE_WORDS(x,esy&0x8000,0,1);/* return +-minsubnormal */ + u = math_opt_barrier (x); + u = u * u; + math_force_eval (u); /* raise underflow flag */ + return x; + } + if(esx>=0) { /* x > 0 */ + if(esx>esy||((esx==esy) && (hx>hy||((hx==hy)&&(lx>ly))))) { + /* x > y, x -= ulp */ + if(lx==0) { + if (hx <= 0x80000000) { + if (esx == 0) { + --hx; + } else { + esx -= 1; + hx = hx - 1; + if (esx > 0) + hx |= 0x80000000; + } + } else + hx -= 1; + } + lx -= 1; + } else { /* x < y, x += ulp */ + lx += 1; + if(lx==0) { + hx += 1; + if (hx==0 || (esx == 0 && hx == 0x80000000)) { + esx += 1; + hx |= 0x80000000; + } + } + } + } else { /* x < 0 */ + if(esy>=0||(esx>esy||((esx==esy)&&(hx>hy||((hx==hy)&&(lx>ly)))))){ + /* x < y, x -= ulp */ + if(lx==0) { + if (hx <= 0x80000000 && esx != 0xffff8000) { + esx -= 1; + hx = hx - 1; + if ((esx&0x7fff) > 0) + hx |= 0x80000000; + } else + hx -= 1; + } + lx -= 1; + } else { /* x > y, x += ulp */ + lx += 1; + if(lx==0) { + hx += 1; + if (hx==0 || (esx == 0xffff8000 && hx == 0x80000000)) { + esx += 1; + hx |= 0x80000000; + } + } + } + } + esy = esx&0x7fff; + if(esy==0x7fff) { + long double u = x + x; /* overflow */ + math_force_eval (u); + __set_errno (ERANGE); + } + if(esy==0) { + long double u = x*x; /* underflow */ + math_force_eval (u); /* raise underflow flag */ + __set_errno (ERANGE); + } + SET_LDOUBLE_WORDS(x,esx,hx,lx); + return x; +} +weak_alias (__nextafterl, nextafterl) +strong_alias (__nextafterl, __nexttowardl) +weak_alias (__nextafterl, nexttowardl) diff --git 
a/REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c b/REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c new file mode 100644 index 0000000000..0b47044760 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_nexttoward.c @@ -0,0 +1,93 @@ +/* s_nexttoward.c + * Special i387 version + * Conversion from s_nextafter.c by Ulrich Drepper, Cygnus Support, + * drepper@cygnus.com. + */ + +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + +#if defined(LIBM_SCCS) && !defined(lint) +static char rcsid[] = "$NetBSD: $"; +#endif + +/* IEEE functions + * nexttoward(x,y) + * return the next machine floating-point number of x in the + * direction toward y. + * Special cases: + */ + +#include <errno.h> +#include <math.h> +#include <math_private.h> +#include <float.h> + +double __nexttoward(double x, long double y) +{ + int32_t hx,ix,iy; + u_int32_t lx,hy,ly,esy; + + EXTRACT_WORDS(hx,lx,x); + GET_LDOUBLE_WORDS(esy,hy,ly,y); + ix = hx&0x7fffffff; /* |x| */ + iy = esy&0x7fff; /* |y| */ + + /* Intel's extended format has the normally implicit 1 explicit + present. Sigh! */ + if(((ix>=0x7ff00000)&&((ix-0x7ff00000)|lx)!=0) || /* x is nan */ + ((iy>=0x7fff)&&((hy&0x7fffffff)|ly)!=0)) /* y is nan */ + return x+y; + if((long double) x==y) return y; /* x=y, return y */ + if((ix|lx)==0) { /* x == 0 */ + double u; + INSERT_WORDS(x,(esy&0x8000)<<16,1); /* return +-minsub */ + u = math_opt_barrier (x); + u = u * u; + math_force_eval (u); /* raise underflow flag */ + return x; + } + if(hx>=0) { /* x > 0 */ + if (x > y) { /* x -= ulp */ + if(lx==0) hx -= 1; + lx -= 1; + } else { /* x < y, x += ulp */ + lx += 1; + if(lx==0) hx += 1; + } + } else { /* x < 0 */ + if (x < y) { /* x -= ulp */ + if(lx==0) hx -= 1; + lx -= 1; + } else { /* x > y, x += ulp */ + lx += 1; + if(lx==0) hx += 1; + } + } + hy = hx&0x7ff00000; + if(hy>=0x7ff00000) { + double u = x+x; /* overflow */ + math_force_eval (u); + __set_errno (ERANGE); + } + if(hy<0x00100000) { + double u = x*x; /* underflow */ + math_force_eval (u); /* raise underflow flag */ + __set_errno (ERANGE); + } + INSERT_WORDS(x,hx,lx); + return x; +} +weak_alias (__nexttoward, nexttoward) +#ifdef NO_LONG_DOUBLE +strong_alias (__nexttoward, __nexttowardl) +weak_alias (__nexttoward, nexttowardl) +#endif diff --git a/REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c b/REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c new file mode 100644 index 0000000000..e1156d1e4f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_nexttowardf.c @@ -0,0 +1,77 @@ +/* s_nexttowardf.c -- float version of s_nextafter.c. + * Special i387 version. + * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com. + */ + +/* + * ==================================================== + * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunPro, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. 
+ * ==================================================== + */ + +#if defined(LIBM_SCCS) && !defined(lint) +static char rcsid[] = "$NetBSD: $"; +#endif + +#include <errno.h> +#include <math.h> +#include <math_private.h> +#include <float.h> + +float __nexttowardf(float x, long double y) +{ + int32_t hx,ix,iy; + u_int32_t hy,ly,esy; + + GET_FLOAT_WORD(hx,x); + GET_LDOUBLE_WORDS(esy,hy,ly,y); + ix = hx&0x7fffffff; /* |x| */ + iy = esy&0x7fff; /* |y| */ + + /* Intel's extended format has the normally implicit 1 explicit + present. Sigh! */ + if((ix>0x7f800000) || /* x is nan */ + (iy>=0x7fff&&(((hy&0x7fffffff)|ly)!=0))) /* y is nan */ + return x+y; + if((long double) x==y) return y; /* x=y, return y */ + if(ix==0) { /* x == 0 */ + float u; + SET_FLOAT_WORD(x,((esy&0x8000)<<16)|1);/* return +-minsub*/ + u = math_opt_barrier (x); + u = u * u; + math_force_eval (u); /* raise underflow flag */ + return x; + } + if(hx>=0) { /* x > 0 */ + if(x > y) { /* x -= ulp */ + hx -= 1; + } else { /* x < y, x += ulp */ + hx += 1; + } + } else { /* x < 0 */ + if(x < y) { /* x -= ulp */ + hx -= 1; + } else { /* x > y, x += ulp */ + hx += 1; + } + } + hy = hx&0x7f800000; + if(hy>=0x7f800000) { + float u = x+x; /* overflow */ + math_force_eval (u); + __set_errno (ERANGE); + } + if(hy<0x00800000) { + float u = x*x; /* underflow */ + math_force_eval (u); /* raise underflow flag */ + __set_errno (ERANGE); + } + SET_FLOAT_WORD(x,hx); + return x; +} +weak_alias (__nexttowardf, nexttowardf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_remquo.S b/REORG.TODO/sysdeps/i386/fpu/s_remquo.S new file mode 100644 index 0000000000..341285db30 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_remquo.S @@ -0,0 +1,45 @@ +/* + * Written by Ulrich Drepper <drepper@cygnus.com>. + * Based on e_remainder by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +#define PARMS 4 /* no space for saved regs */ +#define DVDND PARMS +#define DVSOR DVDND+8 +#define QUOP DVSOR+8 + + .text +ENTRY (__remquo) + + fldl DVSOR(%esp) + fldl DVDND(%esp) +1: fprem1 + fstsw %ax + sahf + jp 1b + fstp %st(1) + /* Compute the congruent of the quotient. */ + movl %eax, %ecx + shrl $8, %eax + shrl $12, %ecx + andl $4, %ecx + andl $3, %eax + orl %eax, %ecx + leal (%ecx,%ecx,2),%ecx + movl $0xef2a60, %eax + shrl %cl, %eax + andl $7, %eax + movl QUOP(%esp), %ecx + movl DVDND+4(%esp), %edx + xorl DVSOR+4(%esp), %edx + testl $0x80000000, %edx + jz 1f + negl %eax +1: movl %eax, (%ecx) + + ret +END (__remquo) +weak_alias (__remquo, remquo) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_remquof.S b/REORG.TODO/sysdeps/i386/fpu/s_remquof.S new file mode 100644 index 0000000000..62063f068f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_remquof.S @@ -0,0 +1,45 @@ +/* + * Written by Ulrich Drepper <drepper@cygnus.com>. + * Based on e_remainder by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +#define PARMS 4 /* no space for saved regs */ +#define DVDND PARMS +#define DVSOR DVDND+4 +#define QUOP DVSOR+4 + + .text +ENTRY (__remquof) + + flds DVSOR(%esp) + flds DVDND(%esp) +1: fprem1 + fstsw %ax + sahf + jp 1b + fstp %st(1) + /* Compute the congruent of the quotient. 
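+ After fprem1 the status-word bits C0, C3 and C1 hold Q2, Q1 and
+ Q0, the low three bits of the quotient; the constant 0xef2a60 is
+ a packed table of eight 3-bit entries which, indexed by
+ (C3<<2)|(C1<<1)|C0, restores the order Q2:Q1:Q0.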
*/ + movl %eax, %ecx + shrl $8, %eax + shrl $12, %ecx + andl $4, %ecx + andl $3, %eax + orl %eax, %ecx + leal (%ecx,%ecx,2),%ecx + movl $0xef2a60, %eax + shrl %cl, %eax + andl $7, %eax + movl QUOP(%esp), %ecx + movl DVDND(%esp), %edx + xorl DVSOR(%esp), %edx + testl $0x80000000, %edx + jz 1f + negl %eax +1: movl %eax, (%ecx) + + ret +END (__remquof) +weak_alias (__remquof, remquof) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_remquol.S b/REORG.TODO/sysdeps/i386/fpu/s_remquol.S new file mode 100644 index 0000000000..f3d84fc7c2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_remquol.S @@ -0,0 +1,45 @@ +/* + * Written by Ulrich Drepper <drepper@cygnus.com>. + * Based on e_remainder by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +#define PARMS 4 /* no space for saved regs */ +#define DVDND PARMS +#define DVSOR DVDND+12 +#define QUOP DVSOR+12 + + .text +ENTRY (__remquol) + + fldt DVSOR(%esp) + fldt DVDND(%esp) +1: fprem1 + fstsw %ax + sahf + jp 1b + fstp %st(1) + /* Compute the congruent of the quotient. */ + movl %eax, %ecx + shrl $8, %eax + shrl $12, %ecx + andl $4, %ecx + andl $3, %eax + orl %eax, %ecx + leal (%ecx,%ecx,2),%ecx + movl $0xef2a60, %eax + shrl %cl, %eax + andl $7, %eax + movl QUOP(%esp), %ecx + movl DVDND+8(%esp), %edx + xorl DVSOR+8(%esp), %edx + testl $0x8000, %edx + jz 1f + negl %eax +1: movl %eax, (%ecx) + + ret +END (__remquol) +weak_alias (__remquol, remquol) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_rint.S b/REORG.TODO/sysdeps/i386/fpu/s_rint.S new file mode 100644 index 0000000000..be36c5f0ca --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_rint.S @@ -0,0 +1,15 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_rint.S,v 1.4 1995/05/09 00:16:08 jtc Exp $") + +ENTRY(__rint) + fldl 4(%esp) + frndint + ret +END (__rint) +weak_alias (__rint, rint) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_rintf.S b/REORG.TODO/sysdeps/i386/fpu/s_rintf.S new file mode 100644 index 0000000000..2b358c1cf1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_rintf.S @@ -0,0 +1,15 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_rintf.S,v 1.3 1995/05/09 00:17:22 jtc Exp $") + +ENTRY(__rintf) + flds 4(%esp) + frndint + ret +END (__rintf) +weak_alias (__rintf, rintf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_rintl.c b/REORG.TODO/sysdeps/i386/fpu/s_rintl.c new file mode 100644 index 0000000000..66af9cb675 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_rintl.c @@ -0,0 +1,18 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Changes for long double by Ulrich Drepper <drepper@cygnus.com> + * Public domain. + */ + +#include <math_private.h> + +long double +__rintl (long double x) +{ + long double res; + + asm ("frndint" : "=t" (res) : "0" (x)); + return res; +} + +weak_alias (__rintl, rintl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbln.c b/REORG.TODO/sysdeps/i386/fpu/s_scalbln.c new file mode 100644 index 0000000000..1009713fbc --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbln.c @@ -0,0 +1,2 @@ +/* Nothing to do. This function is the same as scalbn. So we define an + alias. */ diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c b/REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c new file mode 100644 index 0000000000..5e558c3540 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_scalblnf.c @@ -0,0 +1,2 @@ +/* Nothing to do. This function is the same as scalbnf. So we define an + alias. 
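+ (The strong_alias from __scalbnf to __scalblnf is created in
+ s_scalbnf.S.)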
*/ diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c b/REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c new file mode 100644 index 0000000000..cda2ec11c8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_scalblnl.c @@ -0,0 +1,2 @@ +/* Nothing to do. This function is the same as scalbnl. So we define an + alias. */ diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbn.S b/REORG.TODO/sysdeps/i386/fpu/s_scalbn.S new file mode 100644 index 0000000000..4e90903115 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbn.S @@ -0,0 +1,24 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + +RCSID("$NetBSD: s_scalbn.S,v 1.4 1995/05/09 00:19:06 jtc Exp $") + +ENTRY(__scalbn) + fildl 12(%esp) + fldl 4(%esp) + fscale + fstp %st(1) + DBL_NARROW_EVAL + ret +END (__scalbn) +strong_alias (__scalbn, __scalbln) + +#include <shlib-compat.h> +#if SHLIB_COMPAT (libc, GLIBC_2_1, GLIBC_2_20) +compat_symbol (libc, __scalbn, scalbln, GLIBC_2_1); +#endif diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S b/REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S new file mode 100644 index 0000000000..f8353c4c75 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbnf.S @@ -0,0 +1,24 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> +#include <i386-math-asm.h> + +RCSID("$NetBSD: s_scalbnf.S,v 1.3 1995/05/09 00:19:59 jtc Exp $") + +ENTRY(__scalbnf) + fildl 8(%esp) + flds 4(%esp) + fscale + fstp %st(1) + FLT_NARROW_EVAL + ret +END (__scalbnf) +strong_alias (__scalbnf, __scalblnf) + +#include <shlib-compat.h> +#if SHLIB_COMPAT (libc, GLIBC_2_1, GLIBC_2_20) +compat_symbol (libc, __scalbnf, scalblnf, GLIBC_2_1); +#endif diff --git a/REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S b/REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S new file mode 100644 index 0000000000..839b5ff353 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_scalbnl.S @@ -0,0 +1,23 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Changes for long double by Ulrich Drepper <drepper@cygnus.com> + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: $") + +ENTRY(__scalbnl) + fildl 16(%esp) + fldt 4(%esp) + fscale + fstp %st(1) + ret +END (__scalbnl) +strong_alias (__scalbnl, __scalblnl) + +#include <shlib-compat.h> +#if SHLIB_COMPAT (libc, GLIBC_2_1, GLIBC_2_20) +compat_symbol (libc, __scalbnl, scalblnl, GLIBC_2_1); +#endif diff --git a/REORG.TODO/sysdeps/i386/fpu/s_significand.S b/REORG.TODO/sysdeps/i386/fpu/s_significand.S new file mode 100644 index 0000000000..4859b7ed71 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_significand.S @@ -0,0 +1,16 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_significand.S,v 1.4 1995/05/09 00:21:47 jtc Exp $") + +ENTRY(__significand) + fldl 4(%esp) + fxtract + fstp %st(1) + ret +END (__significand) +weak_alias (__significand, significand) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_significandf.S b/REORG.TODO/sysdeps/i386/fpu/s_significandf.S new file mode 100644 index 0000000000..3a2de97759 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_significandf.S @@ -0,0 +1,16 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. 
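+ *
+ * fxtract pushes the significand on top of the unbiased exponent;
+ * fstp %st(1) then drops the exponent, leaving a value whose
+ * magnitude is in [1, 2) for finite nonzero arguments.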
+ */ + +#include <machine/asm.h> + +RCSID("$NetBSD: s_significandf.S,v 1.3 1995/05/09 00:24:07 jtc Exp $") + +ENTRY(__significandf) + flds 4(%esp) + fxtract + fstp %st(1) + ret +END (__significandf) +weak_alias (__significandf, significandf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_significandl.c b/REORG.TODO/sysdeps/i386/fpu/s_significandl.c new file mode 100644 index 0000000000..b8cb093502 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_significandl.c @@ -0,0 +1,19 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Changes for long double by Ulrich Drepper <drepper@cygnus.com> + * Public domain. + */ + +#include <math_private.h> + +long double +__significandl (long double x) +{ + long double res; + + asm ("fxtract\n" + "fstp %%st(1)" : "=t" (res) : "0" (x)); + return res; +} + +weak_alias (__significandl, significandl) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_trunc.S b/REORG.TODO/sysdeps/i386/fpu/s_trunc.S new file mode 100644 index 0000000000..e9a850b877 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_trunc.S @@ -0,0 +1,37 @@ +/* Truncate double value. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> + +ENTRY(__trunc) + fldl 4(%esp) + subl $32, %esp + cfi_adjust_cfa_offset (32) + fnstenv 4(%esp) + movl $0xc00, %edx + orl 4(%esp), %edx + movl %edx, (%esp) + fldcw (%esp) + frndint + fldenv 4(%esp) + addl $32, %esp + cfi_adjust_cfa_offset (-32) + ret +END(__trunc) +weak_alias (__trunc, trunc) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_truncf.S b/REORG.TODO/sysdeps/i386/fpu/s_truncf.S new file mode 100644 index 0000000000..a93f5b9a2e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_truncf.S @@ -0,0 +1,37 @@ +/* Truncate float value. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <machine/asm.h> + +ENTRY(__truncf) + flds 4(%esp) + subl $32, %esp + cfi_adjust_cfa_offset (32) + fnstenv 4(%esp) + movl $0xc00, %edx + orl 4(%esp), %edx + movl %edx, (%esp) + fldcw (%esp) + frndint + fldenv 4(%esp) + addl $32, %esp + cfi_adjust_cfa_offset (-32) + ret +END(__truncf) +weak_alias (__truncf, truncf) diff --git a/REORG.TODO/sysdeps/i386/fpu/s_truncl.S b/REORG.TODO/sysdeps/i386/fpu/s_truncl.S new file mode 100644 index 0000000000..a884123612 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/s_truncl.S @@ -0,0 +1,40 @@ +/* Truncate long double value. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <machine/asm.h> + +ENTRY(__truncl) + fldt 4(%esp) + subl $32, %esp + cfi_adjust_cfa_offset (32) + fnstenv 4(%esp) + movl $0xc00, %edx + orl 4(%esp), %edx + movl %edx, (%esp) + fldcw (%esp) + frndint + fnstsw + andl $0x1, %eax + orl %eax, 8(%esp) + fldenv 4(%esp) + addl $32, %esp + cfi_adjust_cfa_offset (-32) + ret +END(__truncl) +weak_alias (__truncl, truncl) diff --git a/REORG.TODO/sysdeps/i386/fpu/slowexp.c b/REORG.TODO/sysdeps/i386/fpu/slowexp.c new file mode 100644 index 0000000000..1cc8931700 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/slowexp.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/REORG.TODO/sysdeps/i386/fpu/slowpow.c b/REORG.TODO/sysdeps/i386/fpu/slowpow.c new file mode 100644 index 0000000000..1cc8931700 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/slowpow.c @@ -0,0 +1 @@ +/* Not needed. */ diff --git a/REORG.TODO/sysdeps/i386/fpu/t_exp.c b/REORG.TODO/sysdeps/i386/fpu/t_exp.c new file mode 100644 index 0000000000..fd37963b05 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/t_exp.c @@ -0,0 +1 @@ +/* Empty. Not needed. */ diff --git a/REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c b/REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c new file mode 100644 index 0000000000..ddd36d0964 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/fpu/w_sqrt_compat.c @@ -0,0 +1,8 @@ +/* The inline __ieee754_sqrt is not correctly rounding; it's OK for + most internal uses in glibc, but not for sqrt itself. */ +#define __ieee754_sqrt __avoid_ieee754_sqrt +#include <math.h> +#include <math_private.h> +#undef __ieee754_sqrt +extern double __ieee754_sqrt (double); +#include <math/w_sqrt_compat.c> diff --git a/REORG.TODO/sysdeps/i386/gccframe.h b/REORG.TODO/sysdeps/i386/gccframe.h new file mode 100644 index 0000000000..579da40ae9 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/gccframe.h @@ -0,0 +1,27 @@ +/* Definition of object in frame unwind info. i386 version. + Copyright (C) 2001-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define DWARF_FRAME_REGISTERS 17 + +#define CRT_GET_RFIB_DATA(BASE) \ + { \ + register void *__ebx __asm__("ebx");\ + BASE = __ebx; \ + } + +#include <sysdeps/generic/gccframe.h> diff --git a/REORG.TODO/sysdeps/i386/gmp-mparam.h b/REORG.TODO/sysdeps/i386/gmp-mparam.h new file mode 100644 index 0000000000..7ea503a403 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/gmp-mparam.h @@ -0,0 +1,28 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991-2017 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Lesser General Public License as published by +the Free Software Foundation; either version 2.1 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +License for more details. + +You should have received a copy of the GNU Lesser General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, see +<http://www.gnu.org/licenses/>. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +#define IEEE_DOUBLE_BIG_ENDIAN 0 diff --git a/REORG.TODO/sysdeps/i386/htonl.S b/REORG.TODO/sysdeps/i386/htonl.S new file mode 100644 index 0000000000..63279bb6e1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/htonl.S @@ -0,0 +1,34 @@ +/* Change byte order in word. For Intel 80x86, x >= 4. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
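
The parameters above pin down the mpn representation used by all the i386 mpn routines in this patch: a multi-precision integer is a little-endian array of 32-bit limbs. A small illustration of that encoding (limbs_to_u64 is a hypothetical helper):

#include <stdint.h>

typedef uint32_t mp_limb_t;  /* BYTES_PER_MP_LIMB == 4 */

/* value == limbs[0] + limbs[1]*2^32 + ... (least significant first) */
uint64_t
limbs_to_u64 (const mp_limb_t limbs[2])
{
  return (uint64_t) limbs[0] | ((uint64_t) limbs[1] << 32);
}
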
*/ + +#include <sysdep.h> +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + word (sp + 4) +*/ + + .text +ENTRY (htonl) + movl 4(%esp), %eax + bswap %eax + ret +END (htonl) + +weak_alias (htonl, ntohl) diff --git a/REORG.TODO/sysdeps/i386/htons.S b/REORG.TODO/sysdeps/i386/htons.S new file mode 100644 index 0000000000..a3c53a9944 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/htons.S @@ -0,0 +1,35 @@ +/* Change byte order in word. For Intel 80x86, x >= 3. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + word (sp + 4) +*/ + + .text +ENTRY (htons) + movl 4(%esp), %eax + andl $0xffff, %eax + rorw $8, %ax + ret +END (htons) + +weak_alias (htons, ntohs) diff --git a/REORG.TODO/sysdeps/i386/i386-mcount.S b/REORG.TODO/sysdeps/i386/i386-mcount.S new file mode 100644 index 0000000000..733b8c78e7 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i386-mcount.S @@ -0,0 +1,79 @@ +/* i386-specific implementation of profiling support. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* We need a special version of the `mcount' function since for ix86 it + must not clobber any register. This has several reasons: + - there is a bug in gcc as of version 2.7.2.2 which prohibits the + use of profiling together with nested functions + - the ELF `fixup' function uses GCC's regparm feature + - some (future) systems might want to pass parameters in registers. */ + + .globl C_SYMBOL_NAME(_mcount) + .type C_SYMBOL_NAME(_mcount), @function + .align ALIGNARG(4) +C_LABEL(_mcount) + /* Save the caller-clobbered registers. */ + pushl %eax + pushl %ecx + pushl %edx + + movl 12(%esp), %edx + movl 4(%ebp), %eax + + /* No need to access the PLT or GOT, __mcount_internal is an + internal function and we can make a relative call. */ + call C_SYMBOL_NAME(__mcount_internal) + + /* Pop the saved registers. Please note that `mcount' has no + return value. 
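
htonl above is a single bswap (hence the "x >= 4" in the header: bswap first appeared on the i486), and htons below is a rorw $8 on the low half. Portable C equivalents of the two byte swaps, with hypothetical names:

#include <stdint.h>

uint32_t
my_htonl (uint32_t x)  /* what one bswap does */
{
  return (x << 24) | ((x & 0xff00) << 8)
         | ((x >> 8) & 0xff00) | (x >> 24);
}

uint16_t
my_htons (uint16_t x)  /* what rorw $8 does */
{
  return (uint16_t) ((x << 8) | (x >> 8));
}
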
*/ + popl %edx + popl %ecx + popl %eax + ret + ASM_SIZE_DIRECTIVE(C_SYMBOL_NAME(_mcount)) + +#undef mcount +weak_alias (_mcount, mcount) + + /* Same as above, but doesn't require a frame pointer */ + .globl C_SYMBOL_NAME(__fentry__) + .type C_SYMBOL_NAME(__fentry__), @function + .align ALIGNARG(4) +C_LABEL(__fentry__) + /* Save the caller-clobbered registers. */ + pushl %eax + pushl %ecx + pushl %edx + + movl 12(%esp), %edx + movl 16(%esp), %eax + + /* No need to access the PLT or GOT, __mcount_internal is an + internal function and we can make a relative call. */ + call C_SYMBOL_NAME(__mcount_internal) + + /* Pop the saved registers. Please note that `__fentry__' has no + return value. */ + popl %edx + popl %ecx + popl %eax + ret + ASM_SIZE_DIRECTIVE(C_SYMBOL_NAME(__fentry__)) diff --git a/REORG.TODO/sysdeps/i386/i586/add_n.S b/REORG.TODO/sysdeps/i386/i586/add_n.S new file mode 100644 index 0000000000..f73df092f0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/add_n.S @@ -0,0 +1,143 @@ +/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store + sum in a third limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
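
Both profiling entry points above collect the same pair: frompc, the caller's return address (read through %ebp for _mcount, from the stack for __fentry__), and selfpc, an address inside the profiled function; __mcount_internal records the frompc -> selfpc arc for gprof. A hypothetical recorder showing the shape of that interface (not glibc's gmon code):

#include <stdio.h>

void
record_arc (unsigned long frompc, unsigned long selfpc)
{
  /* gprof reconstructs the call graph from many such samples.  */
  printf ("arc %#lx -> %#lx\n", frompc, selfpc);
}
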
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define S2 S1+4 +#define SIZE S2+4 + + .text +ENTRY (__mpn_add_n) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %ebx + cfi_adjust_cfa_offset (4) + + movl RES(%esp),%edi + cfi_rel_offset (edi, 12) + movl S1(%esp),%esi + cfi_rel_offset (esi, 8) + movl S2(%esp),%ebx + cfi_rel_offset (ebx, 0) + movl SIZE(%esp),%ecx + movl (%ebx),%ebp + cfi_rel_offset (ebp, 4) + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx /* zero carry flag */ + jz L(end) + pushl %edx + cfi_adjust_cfa_offset (4) + + ALIGN (3) +L(oop): movl 28(%edi),%eax /* fetch destination cache line */ + leal 32(%edi),%edi + +L(1): movl (%esi),%eax + movl 4(%esi),%edx + adcl %ebp,%eax + movl 4(%ebx),%ebp + adcl %ebp,%edx + movl 8(%ebx),%ebp + movl %eax,-32(%edi) + movl %edx,-28(%edi) + +L(2): movl 8(%esi),%eax + movl 12(%esi),%edx + adcl %ebp,%eax + movl 12(%ebx),%ebp + adcl %ebp,%edx + movl 16(%ebx),%ebp + movl %eax,-24(%edi) + movl %edx,-20(%edi) + +L(3): movl 16(%esi),%eax + movl 20(%esi),%edx + adcl %ebp,%eax + movl 20(%ebx),%ebp + adcl %ebp,%edx + movl 24(%ebx),%ebp + movl %eax,-16(%edi) + movl %edx,-12(%edi) + +L(4): movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %ebp,%eax + movl 28(%ebx),%ebp + adcl %ebp,%edx + movl 32(%ebx),%ebp + movl %eax,-8(%edi) + movl %edx,-4(%edi) + + leal 32(%esi),%esi + leal 32(%ebx),%ebx + decl %ecx + jnz L(oop) + + popl %edx + cfi_adjust_cfa_offset (-4) +L(end): + decl %edx /* test %edx w/o clobbering carry */ + js L(end2) + incl %edx +L(oop2): + leal 4(%edi),%edi + movl (%esi),%eax + adcl %ebp,%eax + movl 4(%ebx),%ebp + movl %eax,-4(%edi) + leal 4(%esi),%esi + leal 4(%ebx),%ebx + decl %edx + jnz L(oop2) +L(end2): + movl (%esi),%eax + adcl %ebp,%eax + movl %eax,(%edi) + + sbbl %eax,%eax + negl %eax + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_add_n) diff --git a/REORG.TODO/sysdeps/i386/i586/addmul_1.S b/REORG.TODO/sysdeps/i386/i586/addmul_1.S new file mode 100644 index 0000000000..a713192982 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/addmul_1.S @@ -0,0 +1,94 @@ +/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add + the result to a second limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
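
For reference against the unrolled loop above, a plain C version of __mpn_add_n: add two equal-length limb vectors and return the carry out, which the assembly materializes with the final sbbl/negl pair (ref_mpn_add_n is a hypothetical name):

#include <stdint.h>

typedef uint32_t mp_limb_t;

mp_limb_t
ref_mpn_add_n (mp_limb_t *res, const mp_limb_t *s1,
               const mp_limb_t *s2, int size)
{
  mp_limb_t carry = 0;
  for (int i = 0; i < size; i++)
    {
      mp_limb_t sum = s1[i] + s2[i];
      mp_limb_t c = sum < s1[i];     /* carry out of the add */
      res[i] = sum + carry;
      carry = c | (res[i] < sum);    /* plus the incoming carry */
    }
  return carry;
}
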
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define SIZE S1+4 +#define S2LIMB SIZE+4 + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebx + + .text +ENTRY (__mpn_addmul_1) + + pushl %res_ptr + cfi_adjust_cfa_offset (4) + pushl %s1_ptr + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %s2_limb + cfi_adjust_cfa_offset (4) + + movl RES(%esp), %res_ptr + cfi_rel_offset (res_ptr, 12) + movl S1(%esp), %s1_ptr + cfi_rel_offset (s1_ptr, 8) + movl SIZE(%esp), %size + movl S2LIMB(%esp), %s2_limb + cfi_rel_offset (s2_limb, 0) + leal (%res_ptr,%size,4), %res_ptr + leal (%s1_ptr,%size,4), %s1_ptr + negl %size + xorl %ebp, %ebp + cfi_rel_offset (ebp, 4) + ALIGN (3) + +L(oop): adcl $0, %ebp + movl (%s1_ptr,%size,4), %eax + + mull %s2_limb + + addl %ebp, %eax + movl (%res_ptr,%size,4), %ebp + + adcl $0, %edx + addl %eax, %ebp + + movl %ebp, (%res_ptr,%size,4) + incl %size + + movl %edx, %ebp + jnz L(oop) + + adcl $0, %ebp + movl %ebp, %eax + popl %s2_limb + cfi_adjust_cfa_offset (-4) + cfi_restore (s2_limb) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %s1_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (s1_ptr) + popl %res_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (res_ptr) + + ret +#undef size +END (__mpn_addmul_1) diff --git a/REORG.TODO/sysdeps/i386/i586/bzero.S b/REORG.TODO/sysdeps/i386/i586/bzero.S new file mode 100644 index 0000000000..2a106719a4 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/bzero.S @@ -0,0 +1,4 @@ +#define USE_AS_BZERO +#define memset __bzero +#include <sysdeps/i386/i586/memset.S> +weak_alias (__bzero, bzero) diff --git a/REORG.TODO/sysdeps/i386/i586/init-arch.h b/REORG.TODO/sysdeps/i386/i586/init-arch.h new file mode 100644 index 0000000000..4711212e6f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/init-arch.h @@ -0,0 +1,19 @@ +/* Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define MINIMUM_ISA 586 +#include <sysdeps/x86/init-arch.h> diff --git a/REORG.TODO/sysdeps/i386/i586/lshift.S b/REORG.TODO/sysdeps/i386/i586/lshift.S new file mode 100644 index 0000000000..7941c28d9d --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/lshift.S @@ -0,0 +1,255 @@ +/* Pentium optimized __mpn_lshift -- + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
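
Likewise for __mpn_addmul_1 above: multiply each limb of s1 by s2_limb, add the product into res, and propagate the carry, which a 64-bit intermediate expresses directly in C (ref_mpn_addmul_1 is a hypothetical name):

#include <stdint.h>

typedef uint32_t mp_limb_t;

mp_limb_t
ref_mpn_addmul_1 (mp_limb_t *res, const mp_limb_t *s1,
                  int size, mp_limb_t s2_limb)
{
  mp_limb_t carry = 0;
  for (int i = 0; i < size; i++)
    {
      /* Never overflows: (2^32-1)^2 + 2*(2^32-1) < 2^64.  */
      uint64_t p = (uint64_t) s1[i] * s2_limb + res[i] + carry;
      res[i] = (mp_limb_t) p;
      carry = (mp_limb_t) (p >> 32);
    }
  return carry;
}
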
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S RES+4 +#define SIZE S+4 +#define CNT SIZE+4 + + .text +ENTRY (__mpn_lshift) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebp, 0) + pushl %ebx + cfi_adjust_cfa_offset (4) + + movl RES(%esp),%edi + cfi_rel_offset (edi, 12) + movl S(%esp),%esi + cfi_rel_offset (esi, 8) + movl SIZE(%esp),%ebx + cfi_rel_offset (ebx, 0) + movl CNT(%esp),%ecx + +/* We can use faster code for shift-by-1 under certain conditions. */ + cmp $1,%ecx + jne L(normal) + leal 4(%esi),%eax + cmpl %edi,%eax + jnc L(special) /* jump if s_ptr + 1 >= res_ptr */ + leal (%esi,%ebx,4),%eax + cmpl %eax,%edi + jnc L(special) /* jump if res_ptr >= s_ptr + size */ + +L(normal): + leal -4(%edi,%ebx,4),%edi + leal -4(%esi,%ebx,4),%esi + + movl (%esi),%edx + subl $4,%esi + xorl %eax,%eax + shldl %cl,%edx,%eax /* compute carry limb */ + pushl %eax /* push carry limb onto stack */ + cfi_adjust_cfa_offset (4) + + decl %ebx + pushl %ebx + cfi_adjust_cfa_offset (4) + shrl $3,%ebx + jz L(end) + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +L(oop): movl -28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebp + + movl (%esi),%eax + movl -4(%esi),%edx + shldl %cl,%eax,%ebp + shldl %cl,%edx,%eax + movl %ebp,(%edi) + movl %eax,-4(%edi) + + movl -8(%esi),%ebp + movl -12(%esi),%eax + shldl %cl,%ebp,%edx + shldl %cl,%eax,%ebp + movl %edx,-8(%edi) + movl %ebp,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebp + shldl %cl,%edx,%eax + shldl %cl,%ebp,%edx + movl %eax,-16(%edi) + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + shldl %cl,%eax,%ebp + shldl %cl,%edx,%eax + movl %ebp,-24(%edi) + movl %eax,-28(%edi) + + subl $32,%esi + subl $32,%edi + decl %ebx + jnz L(oop) + +L(end): popl %ebx + cfi_adjust_cfa_offset (-4) + andl $7,%ebx + jz L(end2) +L(oop2): + movl (%esi),%eax + shldl %cl,%eax,%edx + movl %edx,(%edi) + movl %eax,%edx + subl $4,%esi + subl $4,%edi + decl %ebx + jnz L(oop2) + +L(end2): + shll %cl,%edx /* compute least significant limb */ + movl %edx,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + cfi_adjust_cfa_offset (-4) + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret + +/* We loop from least significant end of the arrays, which is only + permissible if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. 
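
The shldl loop above implements, from the most significant limb downward, the following C reference; working top-down is what keeps an overlapping res >= s destination safe (ref_mpn_lshift is a hypothetical name, and cnt is assumed to be in 1..31):

#include <stdint.h>

typedef uint32_t mp_limb_t;

mp_limb_t
ref_mpn_lshift (mp_limb_t *res, const mp_limb_t *s,
                int size, unsigned int cnt)
{
  mp_limb_t out = s[size - 1] >> (32 - cnt);  /* the carry limb */
  for (int i = size - 1; i > 0; i--)
    res[i] = (s[i] << cnt) | (s[i - 1] >> (32 - cnt));
  res[0] = s[0] << cnt;
  return out;
}
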
+*/ + + cfi_adjust_cfa_offset (16) + cfi_rel_offset (edi, 12) + cfi_rel_offset (esi, 8) + cfi_rel_offset (ebp, 4) + cfi_rel_offset (ebx, 0) +L(special): + movl (%esi),%edx + addl $4,%esi + + decl %ebx + pushl %ebx + cfi_adjust_cfa_offset (4) + shrl $3,%ebx + + addl %edx,%edx + incl %ebx + decl %ebx + jz L(Lend) + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +L(Loop): + movl 28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebp + + movl (%esi),%eax + movl 4(%esi),%edx + adcl %eax,%eax + movl %ebp,(%edi) + adcl %edx,%edx + movl %eax,4(%edi) + + movl 8(%esi),%ebp + movl 12(%esi),%eax + adcl %ebp,%ebp + movl %edx,8(%edi) + adcl %eax,%eax + movl %ebp,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebp + adcl %edx,%edx + movl %eax,16(%edi) + adcl %ebp,%ebp + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %eax,%eax + movl %ebp,24(%edi) + adcl %edx,%edx + movl %eax,28(%edi) + + leal 32(%esi),%esi /* use leal not to clobber carry */ + leal 32(%edi),%edi + decl %ebx + jnz L(Loop) + +L(Lend): + popl %ebx + cfi_adjust_cfa_offset (-4) + sbbl %eax,%eax /* save carry in %eax */ + andl $7,%ebx + jz L(Lend2) + addl %eax,%eax /* restore carry from eax */ +L(Loop2): + movl %edx,%ebp + movl (%esi),%edx + adcl %edx,%edx + movl %ebp,(%edi) + + leal 4(%esi),%esi /* use leal not to clobber carry */ + leal 4(%edi),%edi + decl %ebx + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax /* restore carry from eax */ +L(L1): movl %edx,(%edi) /* store last limb */ + + sbbl %eax,%eax + negl %eax + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_lshift) diff --git a/REORG.TODO/sysdeps/i386/i586/memcopy.h b/REORG.TODO/sysdeps/i386/i586/memcopy.h new file mode 100644 index 0000000000..39f020a746 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/memcopy.h @@ -0,0 +1,95 @@ +/* memcopy.h -- definitions for memory copy functions. Pentium version. + Copyright (C) 1994-2017 Free Software Foundation, Inc. + Contributed by Torbjorn Granlund (tege@sics.se). + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Get the i386 definitions. We will override some of them below. */ +#include <sysdeps/i386/memcopy.h> + +/* Written like this, the Pentium pipeline can execute the loop at a + sustained rate of 2 instructions/clock, or asymptotically 480 + Mbytes/second at 60Mhz. 
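
The L(special) path above exploits x << 1 == x + x: an adcl chain doubles each limb while the carry flag hands the shifted-out bit to the next limb, which pairs better on the Pentium than shldl. In C terms (ref_lshift1 is a hypothetical name; as the comment notes, this low-to-high order is only safe for non-overlapping operands):

#include <stdint.h>

typedef uint32_t mp_limb_t;

mp_limb_t
ref_lshift1 (mp_limb_t *res, const mp_limb_t *s, int size)
{
  mp_limb_t carry = 0;
  for (int i = 0; i < size; i++)
    {
      mp_limb_t top = s[i] >> 31;   /* the bit the adc passes on */
      res[i] = (s[i] << 1) | carry;
      carry = top;
    }
  return carry;
}
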
*/ + +#undef WORD_COPY_FWD +#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \ + do \ + { \ + asm volatile ("subl $32,%2\n" \ + "js 2f\n" \ + "movl 0(%0),%%edx\n" /* alloc dest line */ \ + "1:\n" \ + "movl 28(%0),%%eax\n" /* alloc dest line */ \ + "subl $32,%2\n" /* decr loop count */ \ + "movl 0(%1),%%eax\n" /* U pipe */ \ + "movl 4(%1),%%edx\n" /* V pipe */ \ + "movl %%eax,0(%0)\n" /* U pipe */ \ + "movl %%edx,4(%0)\n" /* V pipe */ \ + "movl 8(%1),%%eax\n" \ + "movl 12(%1),%%edx\n" \ + "movl %%eax,8(%0)\n" \ + "movl %%edx,12(%0)\n" \ + "movl 16(%1),%%eax\n" \ + "movl 20(%1),%%edx\n" \ + "movl %%eax,16(%0)\n" \ + "movl %%edx,20(%0)\n" \ + "movl 24(%1),%%eax\n" \ + "movl 28(%1),%%edx\n" \ + "movl %%eax,24(%0)\n" \ + "movl %%edx,28(%0)\n" \ + "leal 32(%1),%1\n" /* update src ptr */ \ + "leal 32(%0),%0\n" /* update dst ptr */ \ + "jns 1b\n" \ + "2: addl $32,%2" : \ + "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) : \ + "0" (dst_bp), "1" (src_bp), "2" (nbytes) : \ + "ax", "dx"); \ + } while (0) + +#undef WORD_COPY_BWD +#define WORD_COPY_BWD(dst_ep, src_ep, nbytes_left, nbytes) \ + do \ + { \ + asm volatile ("subl $32,%2\n" \ + "js 2f\n" \ + "movl -4(%0),%%edx\n" \ + "1:\n" \ + "movl -32(%0),%%eax\n" \ + "subl $32,%2\n" \ + "movl -4(%1),%%eax\n" \ + "movl -8(%1),%%edx\n" \ + "movl %%eax,-4(%0)\n" \ + "movl %%edx,-8(%0)\n" \ + "movl -12(%1),%%eax\n" \ + "movl -16(%1),%%edx\n" \ + "movl %%eax,-12(%0)\n" \ + "movl %%edx,-16(%0)\n" \ + "movl -20(%1),%%eax\n" \ + "movl -24(%1),%%edx\n" \ + "movl %%eax,-20(%0)\n" \ + "movl %%edx,-24(%0)\n" \ + "movl -28(%1),%%eax\n" \ + "movl -32(%1),%%edx\n" \ + "movl %%eax,-28(%0)\n" \ + "movl %%edx,-32(%0)\n" \ + "leal -32(%1),%1\n" \ + "leal -32(%0),%0\n" \ + "jns 1b\n" \ + "2: addl $32,%2" : \ + "=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) : \ + "0" (dst_ep), "1" (src_ep), "2" (nbytes) : \ + "ax", "dx"); \ + } while (0) diff --git a/REORG.TODO/sysdeps/i386/i586/memcpy.S b/REORG.TODO/sysdeps/i386/i586/memcpy.S new file mode 100644 index 0000000000..6474a3f653 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/memcpy.S @@ -0,0 +1,124 @@ +/* Highly optimized version for i586. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
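
Both macros above read one word of the destination line before the eight paired moves; the i586 does not allocate a cache line on a write miss, so the dummy load pulls the destination into cache first. A C sketch of the forward loop's structure under that assumption (copy_fwd is a hypothetical name):

#include <stddef.h>
#include <string.h>

void
copy_fwd (char *dst, const char *src, size_t nbytes)
{
  while (nbytes >= 32)
    {
      volatile char touch = dst[28];  /* allocate the dst line */
      (void) touch;
      memcpy (dst, src, 32);          /* the eight movl pairs */
      dst += 32;
      src += 32;
      nbytes -= 32;
    }
  memcpy (dst, src, nbytes);          /* sub-line tail */
}
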
*/ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4+8 /* space for 2 saved regs */ +#define RTN PARMS +#define DEST RTN +#define SRC DEST+4 +#define LEN SRC+4 + + .text +#if defined PIC && IS_IN (libc) +ENTRY (__memcpy_chk) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memcpy_chk) +#endif +ENTRY (memcpy) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + + movl DEST(%esp), %edi + cfi_rel_offset (edi, 4) + movl SRC(%esp), %esi + cfi_rel_offset (esi, 0) + movl LEN(%esp), %ecx + movl %edi, %eax + + /* We need this in any case. */ + cld + + /* Cutoff for the big loop is a size of 32 bytes since otherwise + the loop will never be entered. */ + cmpl $32, %ecx + jbe L(1) + + negl %eax + andl $3, %eax + subl %eax, %ecx + xchgl %eax, %ecx + + rep; movsb + + movl %eax, %ecx + subl $32, %ecx + js L(2) + + /* Read ahead to make sure we write in the cache since the stupid + i586 designers haven't implemented read-on-write-miss. */ + movl (%edi), %eax +L(3): movl 28(%edi), %edx + + /* Now correct the loop counter. Please note that in the following + code the flags are not changed anymore. */ + subl $32, %ecx + + movl (%esi), %eax + movl 4(%esi), %edx + movl %eax, (%edi) + movl %edx, 4(%edi) + movl 8(%esi), %eax + movl 12(%esi), %edx + movl %eax, 8(%edi) + movl %edx, 12(%edi) + movl 16(%esi), %eax + movl 20(%esi), %edx + movl %eax, 16(%edi) + movl %edx, 20(%edi) + movl 24(%esi), %eax + movl 28(%esi), %edx + movl %eax, 24(%edi) + movl %edx, 28(%edi) + + leal 32(%esi), %esi + leal 32(%edi), %edi + + jns L(3) + + /* Correct extra loop counter modification. */ +L(2): addl $32, %ecx +#ifndef USE_AS_MEMPCPY + movl DEST(%esp), %eax +#endif + +L(1): rep; movsb + +#ifdef USE_AS_MEMPCPY + movl %edi, %eax +#endif + + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (memcpy) +#ifndef USE_AS_MEMPCPY +libc_hidden_builtin_def (memcpy) +#endif diff --git a/REORG.TODO/sysdeps/i386/i586/mempcpy.S b/REORG.TODO/sysdeps/i386/i586/mempcpy.S new file mode 100644 index 0000000000..720a4c0923 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/mempcpy.S @@ -0,0 +1,8 @@ +#define USE_AS_MEMPCPY +#define memcpy __mempcpy +#define __memcpy_chk __mempcpy_chk +#include <sysdeps/i386/i586/memcpy.S> + +libc_hidden_def (__mempcpy) +weak_alias (__mempcpy, mempcpy) +libc_hidden_builtin_def (mempcpy) diff --git a/REORG.TODO/sysdeps/i386/i586/memset.S b/REORG.TODO/sysdeps/i386/i586/memset.S new file mode 100644 index 0000000000..4f8f1bcf94 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/memset.S @@ -0,0 +1,121 @@ +/* memset/bzero -- set memory area to CH/0 + Highly optimized version for ix86, x>=5. + Copyright (C) 1996-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Torbjorn Granlund, <tege@matematik.su.se> + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
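
The __memcpy_chk stub above is the whole of the fortify check: compare the compiler-computed destination size against the length, jump to __chk_fail on overflow, and otherwise fall through into memcpy. The equivalent C logic (my_memcpy_chk is a hypothetical name; abort stands in for __chk_fail):

#include <stdlib.h>
#include <string.h>

void *
my_memcpy_chk (void *dst, const void *src, size_t len, size_t dstlen)
{
  if (dstlen < len)
    abort ();                   /* __chk_fail in the real stub */
  return memcpy (dst, src, len);
}
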
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4+4 /* space for 1 saved reg */ +#define RTN PARMS +#define DEST RTN +#ifdef USE_AS_BZERO +# define LEN DEST+4 +#else +# define CHR DEST+4 +# define LEN CHR+4 +#endif + + .text +#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO +ENTRY (__memset_chk) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk) +#endif +ENTRY (memset) + + pushl %edi + cfi_adjust_cfa_offset (4) + + movl DEST(%esp), %edi + cfi_rel_offset (edi, 0) + movl LEN(%esp), %edx +#ifdef USE_AS_BZERO + xorl %eax, %eax /* we fill with 0 */ +#else + movb CHR(%esp), %al + movb %al, %ah + movl %eax, %ecx + shll $16, %eax + movw %cx, %ax +#endif + cld + +/* If less than 36 bytes to write, skip tricky code (it wouldn't work). */ + cmpl $36, %edx + movl %edx, %ecx /* needed when branch is taken! */ + jl L(2) + +/* First write 0-3 bytes to make the pointer 32-bit aligned. */ + movl %edi, %ecx /* Copy ptr to ecx... */ + negl %ecx /* ...and negate that and... */ + andl $3, %ecx /* ...mask to get byte count. */ + subl %ecx, %edx /* adjust global byte count */ + rep + stosb + + subl $32, %edx /* offset count for unrolled loop */ + movl (%edi), %ecx /* Fetch destination cache line */ + + .align 2, 0x90 /* supply 0x90 for broken assemblers */ +L(1): movl 28(%edi), %ecx /* allocate cache line for destination */ + subl $32, %edx /* decr loop count */ + movl %eax, 0(%edi) /* store words pairwise */ + movl %eax, 4(%edi) + movl %eax, 8(%edi) + movl %eax, 12(%edi) + movl %eax, 16(%edi) + movl %eax, 20(%edi) + movl %eax, 24(%edi) + movl %eax, 28(%edi) + leal 32(%edi), %edi /* update destination pointer */ + jge L(1) + + leal 32(%edx), %ecx /* reset offset count */ + +/* Write last 0-7 full 32-bit words (up to 8 words if loop was skipped). */ +L(2): shrl $2, %ecx /* convert byte count to longword count */ + rep + stosl + +/* Finally write the last 0-3 bytes. */ + movl %edx, %ecx + andl $3, %ecx + rep + stosb + +#ifndef USE_AS_BZERO + /* Load result (only if used as memset). */ + movl DEST(%esp), %eax /* start address of destination is result */ +#endif + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (memset) +libc_hidden_builtin_def (memset) + +#if defined SHARED && IS_IN (libc) && !defined __memset_chk \ + && !defined USE_AS_BZERO +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) + .section .gnu.warning.__memset_zero_constant_len_parameter + .string "memset used with constant zero length parameter; this could be due to transposed parameters" +#endif diff --git a/REORG.TODO/sysdeps/i386/i586/memusage.h b/REORG.TODO/sysdeps/i386/i586/memusage.h new file mode 100644 index 0000000000..c8170874d0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/memusage.h @@ -0,0 +1 @@ +#include "../i686/memusage.h" diff --git a/REORG.TODO/sysdeps/i386/i586/mul_1.S b/REORG.TODO/sysdeps/i386/i586/mul_1.S new file mode 100644 index 0000000000..bd3a07de90 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/mul_1.S @@ -0,0 +1,90 @@ +/* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store + the result in a second limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. 
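
The movb/shll/movw prologue above broadcasts the fill byte into all four bytes of %eax. The same replication with shifts (replicate_byte is a hypothetical name):

#include <stdint.h>

uint32_t
replicate_byte (unsigned char c)
{
  uint32_t x = c;   /* 0|0|0|c */
  x |= x << 8;      /* 0|0|c|c */
  x |= x << 16;     /* c|c|c|c */
  return x;
}
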
+ + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define SIZE S1+4 +#define S2LIMB SIZE+4 + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebx + + .text +ENTRY (__mpn_mul_1) + + pushl %res_ptr + cfi_adjust_cfa_offset (4) + pushl %s1_ptr + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %s2_limb + cfi_adjust_cfa_offset (4) + + movl RES(%esp), %res_ptr + cfi_rel_offset (res_ptr, 12) + movl S1(%esp), %s1_ptr + cfi_rel_offset (s1_ptr, 8) + movl SIZE(%esp), %size + movl S2LIMB(%esp), %s2_limb + cfi_rel_offset (s2_limb, 0) + leal (%res_ptr,%size,4), %res_ptr + leal (%s1_ptr,%size,4), %s1_ptr + negl %size + xorl %ebp, %ebp + cfi_rel_offset (ebp, 4) + ALIGN (3) + +L(oop): adcl $0, %ebp + movl (%s1_ptr,%size,4), %eax + + mull %s2_limb + + addl %eax, %ebp + + movl %ebp, (%res_ptr,%size,4) + incl %size + + movl %edx, %ebp + jnz L(oop) + + adcl $0, %ebp + movl %ebp, %eax + popl %s2_limb + cfi_adjust_cfa_offset (-4) + cfi_restore (s2_limb) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %s1_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (s1_ptr) + popl %res_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (res_ptr) + + ret +#undef size +END (__mpn_mul_1) diff --git a/REORG.TODO/sysdeps/i386/i586/rshift.S b/REORG.TODO/sysdeps/i386/i586/rshift.S new file mode 100644 index 0000000000..24c76ee0bb --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/rshift.S @@ -0,0 +1,255 @@ +/* Pentium optimized __mpn_rshift -- + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
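
__mpn_mul_1 above is __mpn_addmul_1 minus the read-modify-write of res: store the low product word plus carry, keep the high word as the next carry (ref_mpn_mul_1 is a hypothetical name):

#include <stdint.h>

typedef uint32_t mp_limb_t;

mp_limb_t
ref_mpn_mul_1 (mp_limb_t *res, const mp_limb_t *s1,
               int size, mp_limb_t s2_limb)
{
  mp_limb_t carry = 0;
  for (int i = 0; i < size; i++)
    {
      uint64_t p = (uint64_t) s1[i] * s2_limb + carry;
      res[i] = (mp_limb_t) p;
      carry = (mp_limb_t) (p >> 32);
    }
  return carry;
}
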
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S RES+4 +#define SIZE S+4 +#define CNT SIZE+4 + + .text +ENTRY (__mpn_rshift) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebp, 0) + pushl %ebx + cfi_adjust_cfa_offset (4) + + movl RES(%esp),%edi + cfi_rel_offset (edi, 12) + movl S(%esp),%esi + cfi_rel_offset (esi, 8) + movl SIZE(%esp),%ebx + cfi_rel_offset (ebx, 0) + movl CNT(%esp),%ecx + +/* We can use faster code for shift-by-1 under certain conditions. */ + cmp $1,%ecx + jne L(normal) + leal 4(%edi),%eax + cmpl %esi,%eax + jnc L(special) /* jump if res_ptr + 1 >= s_ptr */ + leal (%edi,%ebx,4),%eax + cmpl %eax,%esi + jnc L(special) /* jump if s_ptr >= res_ptr + size */ + +L(normal): + movl (%esi),%edx + addl $4,%esi + xorl %eax,%eax + shrdl %cl,%edx,%eax /* compute carry limb */ + pushl %eax /* push carry limb onto stack */ + cfi_adjust_cfa_offset (4) + + decl %ebx + pushl %ebx + cfi_adjust_cfa_offset (4) + shrl $3,%ebx + jz L(end) + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +L(oop): movl 28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebp + + movl (%esi),%eax + movl 4(%esi),%edx + shrdl %cl,%eax,%ebp + shrdl %cl,%edx,%eax + movl %ebp,(%edi) + movl %eax,4(%edi) + + movl 8(%esi),%ebp + movl 12(%esi),%eax + shrdl %cl,%ebp,%edx + shrdl %cl,%eax,%ebp + movl %edx,8(%edi) + movl %ebp,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebp + shrdl %cl,%edx,%eax + shrdl %cl,%ebp,%edx + movl %eax,16(%edi) + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + shrdl %cl,%eax,%ebp + shrdl %cl,%edx,%eax + movl %ebp,24(%edi) + movl %eax,28(%edi) + + addl $32,%esi + addl $32,%edi + decl %ebx + jnz L(oop) + +L(end): popl %ebx + cfi_adjust_cfa_offset (-4) + andl $7,%ebx + jz L(end2) +L(oop2): + movl (%esi),%eax + shrdl %cl,%eax,%edx /* compute result limb */ + movl %edx,(%edi) + movl %eax,%edx + addl $4,%esi + addl $4,%edi + decl %ebx + jnz L(oop2) + +L(end2): + shrl %cl,%edx /* compute most significant limb */ + movl %edx,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + cfi_adjust_cfa_offset (-4) + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret + +/* We loop from least significant end of the arrays, which is only + permissible if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. 
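
The shrdl loop above, running from the least significant limb upward, corresponds to this C reference; the returned limb holds the bits shifted out at the bottom (ref_mpn_rshift is a hypothetical name, cnt assumed in 1..31):

#include <stdint.h>

typedef uint32_t mp_limb_t;

mp_limb_t
ref_mpn_rshift (mp_limb_t *res, const mp_limb_t *s,
                int size, unsigned int cnt)
{
  mp_limb_t out = s[0] << (32 - cnt);  /* bits falling off the end */
  for (int i = 0; i < size - 1; i++)
    res[i] = (s[i] >> cnt) | (s[i + 1] << (32 - cnt));
  res[size - 1] = s[size - 1] >> cnt;
  return out;
}
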
+*/ + + cfi_adjust_cfa_offset (16) + cfi_rel_offset (edi, 12) + cfi_rel_offset (esi, 8) + cfi_rel_offset (ebp, 4) + cfi_rel_offset (ebx, 0) +L(special): + leal -4(%edi,%ebx,4),%edi + leal -4(%esi,%ebx,4),%esi + + movl (%esi),%edx + subl $4,%esi + + decl %ebx + pushl %ebx + cfi_adjust_cfa_offset (4) + shrl $3,%ebx + + shrl $1,%edx + incl %ebx + decl %ebx + jz L(Lend) + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +L(Loop): + movl -28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebp + + movl (%esi),%eax + movl -4(%esi),%edx + rcrl $1,%eax + movl %ebp,(%edi) + rcrl $1,%edx + movl %eax,-4(%edi) + + movl -8(%esi),%ebp + movl -12(%esi),%eax + rcrl $1,%ebp + movl %edx,-8(%edi) + rcrl $1,%eax + movl %ebp,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebp + rcrl $1,%edx + movl %eax,-16(%edi) + rcrl $1,%ebp + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + rcrl $1,%eax + movl %ebp,-24(%edi) + rcrl $1,%edx + movl %eax,-28(%edi) + + leal -32(%esi),%esi /* use leal not to clobber carry */ + leal -32(%edi),%edi + decl %ebx + jnz L(Loop) + +L(Lend): + popl %ebx + cfi_adjust_cfa_offset (-4) + sbbl %eax,%eax /* save carry in %eax */ + andl $7,%ebx + jz L(Lend2) + addl %eax,%eax /* restore carry from eax */ +L(Loop2): + movl %edx,%ebp + movl (%esi),%edx + rcrl $1,%edx + movl %ebp,(%edi) + + leal -4(%esi),%esi /* use leal not to clobber carry */ + leal -4(%edi),%edi + decl %ebx + jnz L(Loop2) + + jmp L(L1) +L(Lend2): + addl %eax,%eax /* restore carry from eax */ +L(L1): movl %edx,(%edi) /* store last limb */ + + movl $0,%eax + rcrl $1,%eax + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_rshift) diff --git a/REORG.TODO/sysdeps/i386/i586/stpcpy.S b/REORG.TODO/sysdeps/i386/i586/stpcpy.S new file mode 100644 index 0000000000..8691efd01c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/stpcpy.S @@ -0,0 +1,8 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy + +#include <sysdeps/i386/i586/strcpy.S> + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/REORG.TODO/sysdeps/i386/i586/strchr.S b/REORG.TODO/sysdeps/i386/i586/strchr.S new file mode 100644 index 0000000000..02f66b8f72 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/strchr.S @@ -0,0 +1,348 @@ +/* Find character CH in a NUL terminated string. + Highly optimized version for ix85, x>=5. + Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +/* This version is especially optimized for the i586 (and following?) 
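
The stpcpy.S wrapper above builds __stpcpy from the i586 strcpy by defining USE_AS_STPCPY, which only changes what ends up in %eax on return. The relation it encodes, as a simple C sketch (my_stpcpy is a hypothetical name):

#include <string.h>

char *
my_stpcpy (char *dest, const char *src)
{
  size_t n = strlen (src);
  memcpy (dest, src, n + 1);    /* include the NUL */
  return dest + n;              /* stpcpy returns the end, not dest */
}
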
+ processors. This is mainly done by using the two pipelines. The + version optimized for i486 is weak in this aspect because to get + as much parallelism we have to execute some *more* instructions. + + The code below is structured to reflect the pairing of the instructions + as *I think* it is. I have no processor data book to verify this. + If you find something you think is incorrect let me know. */ + + +/* The magic value which is used throughout in the whole code. */ +#define magic 0xfefefeff + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RTN PARMS +#define STR RTN +#define CHR STR+4 + + .text +ENTRY (strchr) + + pushl %edi /* Save callee-safe registers. */ + cfi_adjust_cfa_offset (-4) + pushl %esi + cfi_adjust_cfa_offset (-4) + + pushl %ebx + cfi_adjust_cfa_offset (-4) + pushl %ebp + cfi_adjust_cfa_offset (-4) + + movl STR(%esp), %eax + movl CHR(%esp), %edx + + movl %eax, %edi /* duplicate string pointer for later */ + cfi_rel_offset (edi, 12) + xorl %ecx, %ecx /* clear %ecx */ + + /* At the moment %edx contains C. What we need for the + algorithm is C in all bytes of the dword. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). */ + movb %dl, %dh /* now it is 0|0|c|c */ + movb %dl, %cl /* we construct the lower half in %ecx */ + + shll $16, %edx /* now %edx is c|c|0|0 */ + movb %cl, %ch /* now %ecx is 0|0|c|c */ + + orl %ecx, %edx /* and finally c|c|c|c */ + andl $3, %edi /* mask alignment bits */ + + jz L(11) /* alignment is 0 => start loop */ + + movb %dl, %cl /* 0 is needed below */ + jp L(0) /* exactly two bits set */ + + xorb (%eax), %cl /* is byte the one we are looking for? */ + jz L(out) /* yes => return pointer */ + + xorb %dl, %cl /* load single byte and test for NUL */ + je L(3) /* yes => return NULL */ + + movb 1(%eax), %cl /* load single byte */ + incl %eax + + cmpb %cl, %dl /* is byte == C? */ + je L(out) /* aligned => return pointer */ + + cmpb $0, %cl /* is byte NUL? */ + je L(3) /* yes => return NULL */ + + incl %eax + decl %edi + + jne L(11) + +L(0): movb (%eax), %cl /* load single byte */ + + cmpb %cl, %dl /* is byte == C? */ + je L(out) /* aligned => return pointer */ + + cmpb $0, %cl /* is byte NUL? */ + je L(3) /* yes => return NULL */ + + incl %eax /* increment pointer */ + + cfi_rel_offset (esi, 8) + cfi_rel_offset (ebx, 4) + cfi_rel_offset (ebp, 0) + + /* The following code is the preparation for the loop. The + four instruction up to `L1' will not be executed in the loop + because the same code is found at the end of the loop, but + there it is executed in parallel with other instructions. */ +L(11): movl (%eax), %ecx + movl $magic, %ebp + + movl $magic, %edi + addl %ecx, %ebp + + /* The main loop: it looks complex and indeed it is. I would + love to say `it was hard to write, so it should he hard to + read' but I will give some more hints. To fully understand + this code you should first take a look at the i486 version. + The basic algorithm is the same, but here the code organized + in a way which permits to use both pipelines all the time. + + I tried to make it a bit more understandable by indenting + the code according to stage in the algorithm. 
It goes as + follows: + check for 0 in 1st word + check for C in 1st word + check for 0 in 2nd word + check for C in 2nd word + check for 0 in 3rd word + check for C in 3rd word + check for 0 in 4th word + check for C in 4th word + + Please note that doing the test for NUL before the test for + C allows us to overlap the test for 0 in the next word with + the test for C. */ + +L(1): xorl %ecx, %ebp /* (word^magic) */ + addl %ecx, %edi /* add magic word */ + + leal 4(%eax), %eax /* increment pointer */ + jnc L(4) /* previous addl caused overflow? */ + + movl %ecx, %ebx /* duplicate original word */ + orl $magic, %ebp /* (word^magic)|magic */ + + addl $1, %ebp /* (word^magic)|magic == 0xffffffff? */ + jne L(4) /* yes => we found word with NUL */ + + movl $magic, %esi /* load magic value */ + xorl %edx, %ebx /* clear words which are C */ + + movl (%eax), %ecx + addl %ebx, %esi /* (word+magic) */ + + movl $magic, %edi + jnc L(5) /* previous addl caused overflow? */ + + movl %edi, %ebp + xorl %ebx, %esi /* (word+magic)^word */ + + addl %ecx, %ebp + orl $magic, %esi /* ((word+magic)^word)|magic */ + + addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/ + jne L(5) /* yes => we found word with C */ + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L(4) + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L(4) + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L(5) + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + jne L(5) + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L(4) + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L(4) + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L(5) + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + jne L(5) + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L(4) + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L(4) + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L(5) + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + + je L(1) + + /* We know there is no NUL byte but a C byte in the word. + %ebx contains NUL in this particular byte. */ +L(5): subl $4, %eax /* adjust pointer */ + testb %bl, %bl /* first byte == C? */ + + jz L(out) /* yes => return pointer */ + + incl %eax /* increment pointer */ + testb %bh, %bh /* second byte == C? */ + + jz L(out) /* yes => return pointer */ + + shrl $16, %ebx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmp $0, %bl /* third byte == C */ + je L(out) /* yes => return pointer */ + + incl %eax /* increment pointer */ + +L(out): popl %ebp /* restore saved registers */ + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret + + cfi_adjust_cfa_offset (16) + cfi_rel_offset (edi, 12) + cfi_rel_offset (esi, 8) + cfi_rel_offset (ebx, 4) + cfi_rel_offset (ebp, 0) + /* We know there is a NUL byte in the word. But we have to test + whether there is an C byte before it in the word. */ +L(4): subl $4, %eax /* adjust pointer */ + cmpb %dl, %cl /* first byte == C? 
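
Stripped of the pairing and unrolling, each round of the loop applies one word-sized test twice: once against the original word to find a NUL, once against word ^ c|c|c|c to find C. A C rendering of that test, with the carry out of the top byte folded in as an unsigned comparison (hypothetical helper names):

#include <stdint.h>

#define MAGIC 0xfefefeffU

/* Adding MAGIC carries out of every nonzero byte; a byte that fails
   to carry must have been zero.  The sum >= word case is the missing
   carry out of bit 31 that the assembly tests with jnc.  */
int
has_zero_byte (uint32_t word)
{
  uint32_t sum = word + MAGIC;
  return sum >= word || ((sum ^ word) | MAGIC) != 0xffffffffU;
}

/* A byte equal to C becomes zero under XOR with c|c|c|c.  */
int
has_byte (uint32_t word, unsigned char c)
{
  return has_zero_byte (word ^ (c * 0x01010101U));
}
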
*/ + + je L(out) /* yes => return pointer */ + + cmpb $0, %cl /* first byte == NUL? */ + je L(3) /* yes => return NULL */ + + incl %eax /* increment pointer */ + + cmpb %dl, %ch /* second byte == C? */ + je L(out) /* yes => return pointer */ + + cmpb $0, %ch /* second byte == NUL? */ + je L(3) /* yes => return NULL */ + + shrl $16, %ecx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmpb %dl, %cl /* third byte == C? */ + je L(out) /* yes => return pointer */ + + cmpb $0, %cl /* third byte == NUL? */ + je L(3) /* yes => return NULL */ + + incl %eax /* increment pointer */ + + /* The test four the fourth byte is necessary! */ + cmpb %dl, %ch /* fourth byte == C? */ + je L(out) /* yes => return pointer */ + +L(3): xorl %eax, %eax + jmp L(out) +END (strchr) + +#undef index +weak_alias (strchr, index) +libc_hidden_builtin_def (strchr) diff --git a/REORG.TODO/sysdeps/i386/i586/strcpy.S b/REORG.TODO/sysdeps/i386/i586/strcpy.S new file mode 100644 index 0000000000..a444604f4f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/strcpy.S @@ -0,0 +1,169 @@ +/* strcpy/stpcpy implementation for i586. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4+12 /* space for 3 saved regs */ +#define RTN PARMS +#define DEST RTN +#define SRC DEST+4 + +#ifndef USE_AS_STPCPY +# define STRCPY strcpy +#endif + +#define magic 0xfefefeff + + .text +ENTRY (STRCPY) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + pushl %ebx + cfi_adjust_cfa_offset (4) + + movl DEST(%esp), %edi + cfi_rel_offset (edi, 8) + movl SRC(%esp), %esi + cfi_rel_offset (esi, 4) + + xorl %eax, %eax + leal -1(%esi), %ecx + + movl $magic, %ebx + cfi_rel_offset (ebx, 0) + andl $3, %ecx + +#ifdef PIC + call 2f + cfi_adjust_cfa_offset (4) +2: popl %edx + cfi_adjust_cfa_offset (-4) + /* 0xb is the distance between 2: and 1: but we avoid writing + 1f-2b because the assembler generates worse code. 
*/ + leal 0xb(%edx,%ecx,8), %ecx +#else + leal 1f(,%ecx,8), %ecx +#endif + + jmp *%ecx + + .align 8 +1: + orb (%esi), %al + jz L(end) + stosb + xorl %eax, %eax + incl %esi + + orb (%esi), %al + jz L(end) + stosb + xorl %eax, %eax + incl %esi + + orb (%esi), %al + jz L(end) + stosb + xorl %eax, %eax + incl %esi + +L(1): movl (%esi), %ecx + leal 4(%esi),%esi + + subl %ecx, %eax + addl %ebx, %ecx + + decl %eax + jnc L(3) + + movl %ecx, %edx + xorl %ecx, %eax + + subl %ebx, %edx + andl $~magic, %eax + + jne L(4) + + movl %edx, (%edi) + leal 4(%edi),%edi + + jmp L(1) + +L(3): movl %ecx, %edx + + subl %ebx, %edx + +L(4): movb %dl, (%edi) + testb %dl, %dl + + movl %edx, %eax + jz L(end2) + + shrl $16, %eax + movb %dh, 1(%edi) +#ifdef USE_AS_STPCPY + addl $1, %edi +#endif + + cmpb $0, %dh + jz L(end2) + +#ifdef USE_AS_STPCPY + movb %al, 1(%edi) + addl $1, %edi + + cmpb $0, %al + jz L(end2) + + addl $1, %edi +#else + movb %al, 2(%edi) + testb %al, %al + + leal 3(%edi), %edi + jz L(end2) +#endif + +L(end): movb %ah, (%edi) + +L(end2): +#ifdef USE_AS_STPCPY + movl %edi, %eax +#else + movl DEST(%esp), %eax +#endif + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (STRCPY) +#ifndef USE_AS_STPCPY +libc_hidden_builtin_def (strcpy) +#endif diff --git a/REORG.TODO/sysdeps/i386/i586/strlen.S b/REORG.TODO/sysdeps/i386/i586/strlen.S new file mode 100644 index 0000000000..cfea2e020f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/strlen.S @@ -0,0 +1,182 @@ +/* strlen -- Compute length of NUL terminated string. + Highly optimized version for ix86, x>=5. + Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +/* This version is especially optimized for the i586 (and following?) + processors. This is mainly done by using the two pipelines. The + version optimized for i486 is weak in this aspect because to get + as much parallelism we have to execute some *more* instructions. + + The code below is structured to reflect the pairing of the instructions + as *I think* it is. I have no processor data book to verify this. + If you find something you think is incorrect let me know. */ + + +/* The magic value which is used throughout in the whole code. */ +#define magic 0xfefefeff + +#define PARMS 4 /* no space for saved regs */ +#define STR PARMS + + .text +ENTRY (strlen) + + movl STR(%esp), %eax + movl $3, %edx /* load mask (= 3) */ + + andl %eax, %edx /* separate last two bits of address */ + + jz L(1) /* aligned => start loop */ + jp L(0) /* exactly two bits set */ + + cmpb %dh, (%eax) /* is byte NUL? 
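
strcpy above runs the same NUL-in-word test over the source (after the computed jump has byte-copied 0..3 bytes to align it) and copies whole words until it reaches the word holding the terminator, which it stores bytewise. A C sketch of that structure (my_strcpy is a hypothetical name; like the assembly, it reads whole aligned source words, possibly a few bytes past the NUL):

#include <stdint.h>
#include <string.h>

#define MAGIC 0xfefefeffU

char *
my_strcpy (char *dest, const char *src)
{
  char *d = dest;
  while (((uintptr_t) src & 3) != 0)    /* the 0..3 alignment bytes */
    if ((*d++ = *src++) == '\0')
      return dest;
  for (;;)
    {
      uint32_t w;
      memcpy (&w, src, 4);              /* aligned word load */
      uint32_t sum = w + MAGIC;
      if (sum >= w || ((sum ^ w) | MAGIC) != 0xffffffffU)
        break;                          /* this word holds the NUL */
      memcpy (d, &w, 4);
      d += 4;
      src += 4;
    }
  while ((*d++ = *src++) != '\0')       /* bytewise tail, NUL included */
    ;
  return dest;
}
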
*/ + je L(2) /* yes => return */ + + incl %eax /* increment pointer */ + cmpb %dh, (%eax) /* is byte NUL? */ + + je L(2) /* yes => return */ + + incl %eax /* increment pointer */ + xorl $2, %edx + + jz L(1) + +L(0): cmpb %dh, (%eax) /* is byte NUL? */ + je L(2) /* yes => return */ + + incl %eax /* increment pointer */ + xorl %edx, %edx /* We need %edx == 0 for later */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. + + Note: %edx == 0 in any case here. */ + +L(1): + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L(3) /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L(3) /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L(3) /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L(3) /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L(3) /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L(3) /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L(3) /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + je L(1) /* no => start loop again */ + + +L(3): subl $4, %eax /* correct too early pointer increment */ + subl $magic, %ecx + + cmpb $0, %cl /* lowest byte NUL? */ + jz L(2) /* yes => return */ + + inc %eax /* increment pointer */ + testb %ch, %ch /* second byte NUL? */ + + jz L(2) /* yes => return */ + + shrl $16, %ecx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmpb $0, %cl /* is third byte NUL? 
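
The long comment above justifies the word test; put back into C, the whole of strlen is: align, scan a word at a time until the test fires, then locate the NUL bytewise (my_strlen is a hypothetical name; as in the assembly, the final aligned word is read in full):

#include <stdint.h>
#include <string.h>

#define MAGIC 0xfefefeffU

size_t
my_strlen (const char *str)
{
  const char *s = str;
  while (((uintptr_t) s & 3) != 0)      /* the 0..3 prologue bytes */
    {
      if (*s == '\0')
        return s - str;
      s++;
    }
  for (;;)
    {
      uint32_t w;
      memcpy (&w, s, 4);                /* aligned word load */
      uint32_t sum = w + MAGIC;
      if (sum >= w || ((sum ^ w) | MAGIC) != 0xffffffffU)
        break;                          /* a NUL is in this word */
      s += 4;
    }
  while (*s != '\0')                    /* pin down which byte */
    s++;
  return s - str;
}
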
*/ + jz L(2) /* yes => return */ + + incl %eax /* increment pointer */ + +L(2): subl STR(%esp), %eax /* now compute the length as difference + between start and terminating NUL + character */ + ret +END (strlen) +libc_hidden_builtin_def (strlen) diff --git a/REORG.TODO/sysdeps/i386/i586/sub_n.S b/REORG.TODO/sysdeps/i386/i586/sub_n.S new file mode 100644 index 0000000000..21b5a2742c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/sub_n.S @@ -0,0 +1,143 @@ +/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0 + and store difference in a third limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define S2 S1+4 +#define SIZE S2+4 + + .text +ENTRY (__mpn_sub_n) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %ebx + cfi_adjust_cfa_offset (4) + + movl RES(%esp),%edi + cfi_rel_offset (edi, 12) + movl S1(%esp),%esi + cfi_rel_offset (esi, 8) + movl S2(%esp),%ebx + cfi_rel_offset (ebx, 0) + movl SIZE(%esp),%ecx + movl (%ebx),%ebp + cfi_rel_offset (ebp, 4) + + decl %ecx + movl %ecx,%edx + shrl $3,%ecx + andl $7,%edx + testl %ecx,%ecx /* zero carry flag */ + jz L(end) + pushl %edx + cfi_adjust_cfa_offset (4) + + ALIGN (3) +L(oop): movl 28(%edi),%eax /* fetch destination cache line */ + leal 32(%edi),%edi + +L(1): movl (%esi),%eax + movl 4(%esi),%edx + sbbl %ebp,%eax + movl 4(%ebx),%ebp + sbbl %ebp,%edx + movl 8(%ebx),%ebp + movl %eax,-32(%edi) + movl %edx,-28(%edi) + +L(2): movl 8(%esi),%eax + movl 12(%esi),%edx + sbbl %ebp,%eax + movl 12(%ebx),%ebp + sbbl %ebp,%edx + movl 16(%ebx),%ebp + movl %eax,-24(%edi) + movl %edx,-20(%edi) + +L(3): movl 16(%esi),%eax + movl 20(%esi),%edx + sbbl %ebp,%eax + movl 20(%ebx),%ebp + sbbl %ebp,%edx + movl 24(%ebx),%ebp + movl %eax,-16(%edi) + movl %edx,-12(%edi) + +L(4): movl 24(%esi),%eax + movl 28(%esi),%edx + sbbl %ebp,%eax + movl 28(%ebx),%ebp + sbbl %ebp,%edx + movl 32(%ebx),%ebp + movl %eax,-8(%edi) + movl %edx,-4(%edi) + + leal 32(%esi),%esi + leal 32(%ebx),%ebx + decl %ecx + jnz L(oop) + + popl %edx + cfi_adjust_cfa_offset (-4) +L(end): + decl %edx /* test %edx w/o clobbering carry */ + js L(end2) + incl %edx +L(oop2): + leal 4(%edi),%edi + movl (%esi),%eax + sbbl %ebp,%eax + movl 4(%ebx),%ebp + movl %eax,-4(%edi) + leal 4(%esi),%esi + leal 4(%ebx),%ebx + decl %edx + jnz L(oop2) +L(end2): + movl (%esi),%eax + sbbl %ebp,%eax + movl %eax,(%edi) + + sbbl %eax,%eax + negl %eax + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + 
cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_sub_n) diff --git a/REORG.TODO/sysdeps/i386/i586/submul_1.S b/REORG.TODO/sysdeps/i386/i586/submul_1.S new file mode 100644 index 0000000000..5e5e121ca2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i586/submul_1.S @@ -0,0 +1,94 @@ +/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract + the result from a second limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define SIZE S1+4 +#define S2LIMB SIZE+4 + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebx + + .text +ENTRY (__mpn_submul_1) + + pushl %res_ptr + cfi_adjust_cfa_offset (4) + pushl %s1_ptr + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %s2_limb + cfi_adjust_cfa_offset (4) + + movl RES(%esp), %res_ptr + cfi_rel_offset (res_ptr, 12) + movl S1(%esp), %s1_ptr + cfi_rel_offset (s1_ptr, 8) + movl SIZE(%esp), %size + movl S2LIMB(%esp), %s2_limb + cfi_rel_offset (s2_limb, 0) + leal (%res_ptr,%size,4), %res_ptr + leal (%s1_ptr,%size,4), %s1_ptr + negl %size + xorl %ebp, %ebp + cfi_rel_offset (ebp, 4) + ALIGN (3) + +L(oop): adcl $0, %ebp + movl (%s1_ptr,%size,4), %eax + + mull %s2_limb + + addl %ebp, %eax + movl (%res_ptr,%size,4), %ebp + + adcl $0, %edx + subl %eax, %ebp + + movl %ebp, (%res_ptr,%size,4) + incl %size + + movl %edx, %ebp + jnz L(oop) + + adcl $0, %ebp + movl %ebp, %eax + popl %s2_limb + cfi_adjust_cfa_offset (-4) + cfi_restore (s2_limb) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %s1_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (s1_ptr) + popl %res_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (res_ptr) + + ret +#undef size +END (__mpn_submul_1) diff --git a/REORG.TODO/sysdeps/i386/i686/Makefile b/REORG.TODO/sysdeps/i386/i686/Makefile new file mode 100644 index 0000000000..311042787b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/Makefile @@ -0,0 +1,12 @@ +# So that we can test __m128's alignment +stack-align-test-flags += -msse + +CFLAGS-.o += -Wa,-mtune=i686 +CFLAGS-.os += -Wa,-mtune=i686 +CFLAGS-.op += -Wa,-mtune=i686 +CFLAGS-.oS += -Wa,-mtune=i686 + +ASFLAGS-.o += -Wa,-mtune=i686 +ASFLAGS-.os += -Wa,-mtune=i686 +ASFLAGS-.op += -Wa,-mtune=i686 +ASFLAGS-.oS += -Wa,-mtune=i686 diff --git a/REORG.TODO/sysdeps/i386/i686/add_n.S b/REORG.TODO/sysdeps/i386/i686/add_n.S new file mode 100644 index 0000000000..4afa648ceb --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/add_n.S @@ -0,0 +1,110 @@ +/* Add two limb vectors of the same length > 0 and store sum in a third + limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. 
+ This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+8 /* space for 2 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define S2 S1+4 +#define SIZE S2+4 + + .text +#ifdef PIC +L(1): addl (%esp), %eax + ret +#endif +ENTRY (__mpn_add_n) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + + movl RES(%esp),%edi + cfi_rel_offset (edi, 4) + movl S1(%esp),%esi + cfi_rel_offset (esi, 0) + movl S2(%esp),%edx + movl SIZE(%esp),%ecx + movl %ecx,%eax + shrl $3,%ecx /* compute count for unrolled loop */ + negl %eax + andl $7,%eax /* get index where to start loop */ + jz L(oop) /* necessary special case for 0 */ + incl %ecx /* adjust loop count */ + shll $2,%eax /* adjustment for pointers... */ + subl %eax,%edi /* ... since they are offset ... */ + subl %eax,%esi /* ... by a constant when we ... */ + subl %eax,%edx /* ... enter the loop */ + shrl $2,%eax /* restore previous value */ +#ifdef PIC +/* Calculate start address in loop for PIC. */ + leal (L(oop)-L(0)-3)(%eax,%eax,8),%eax + call L(1) +L(0): +#else +/* Calculate start address in loop for non-PIC. 
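+   As far as I can tell the "- 3" works because the first
+   movl/adcl/movl group of the unrolled loop encodes with zero
+   displacements (6 bytes) while every later group uses 8-bit
+   displacements (9 bytes), so group g starts 9*g - 3 bytes past
+   L(oop); %eax*9 - 3 therefore lands on group %eax.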
*/ + leal (L(oop) - 3)(%eax,%eax,8),%eax +#endif + jmp *%eax /* jump into loop */ + ALIGN (3) +L(oop): movl (%esi),%eax + adcl (%edx),%eax + movl %eax,(%edi) + movl 4(%esi),%eax + adcl 4(%edx),%eax + movl %eax,4(%edi) + movl 8(%esi),%eax + adcl 8(%edx),%eax + movl %eax,8(%edi) + movl 12(%esi),%eax + adcl 12(%edx),%eax + movl %eax,12(%edi) + movl 16(%esi),%eax + adcl 16(%edx),%eax + movl %eax,16(%edi) + movl 20(%esi),%eax + adcl 20(%edx),%eax + movl %eax,20(%edi) + movl 24(%esi),%eax + adcl 24(%edx),%eax + movl %eax,24(%edi) + movl 28(%esi),%eax + adcl 28(%edx),%eax + movl %eax,28(%edi) + leal 32(%edi),%edi + leal 32(%esi),%esi + leal 32(%edx),%edx + decl %ecx + jnz L(oop) + + sbbl %eax,%eax + negl %eax + + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_add_n) diff --git a/REORG.TODO/sysdeps/i386/i686/bcopy.S b/REORG.TODO/sysdeps/i386/i686/bcopy.S new file mode 100644 index 0000000000..15ef9419a4 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/bcopy.S @@ -0,0 +1,3 @@ +#define USE_AS_BCOPY +#define memmove bcopy +#include <sysdeps/i386/i686/memmove.S> diff --git a/REORG.TODO/sysdeps/i386/i686/bzero.S b/REORG.TODO/sysdeps/i386/i686/bzero.S new file mode 100644 index 0000000000..c7898f18e0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/bzero.S @@ -0,0 +1,4 @@ +#define USE_AS_BZERO +#define memset __bzero +#include <sysdeps/i386/i686/memset.S> +weak_alias (__bzero, bzero) diff --git a/REORG.TODO/sysdeps/i386/i686/dl-hash.h b/REORG.TODO/sysdeps/i386/i686/dl-hash.h new file mode 100644 index 0000000000..ceda785b32 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/dl-hash.h @@ -0,0 +1,79 @@ +/* Compute hash value for given string according to ELF standard. + Copyright (C) 1998-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _DL_HASH_H +#define _DL_HASH_H 1 + + +/* This is the hashing function specified by the ELF ABI. It is highly + optimized for the PII processors. Though it will run on i586 it + would be much slower than the generic C implementation. So don't + use it.
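+
+   For reference, the classic C formulation of the ELF hash -- the
+   function the assembly below computes -- is:
+
+     unsigned int h = 0, g;
+     while (*name != '\0')
+       {
+         h = (h << 4) + (unsigned char) *name++;
+         g = h & 0xf0000000;
+         h ^= g >> 24;
+         h &= ~g;
+       }
+     return h;
+
+   The asm folds the final "h &= ~g" into "andl $0x0fffffff", which
+   is equivalent because g can only have bits set in the top nibble.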
*/ +static unsigned int +__attribute__ ((unused)) +_dl_elf_hash (const char *name) +{ + unsigned int result; + unsigned int temp0; + unsigned int temp1; + + __asm__ __volatile__ + ("movzbl (%1),%2\n\t" + "testl %2, %2\n\t" + "jz 1f\n\t" + "movl %2, %0\n\t" + "movzbl 1(%1), %2\n\t" + "jecxz 1f\n\t" + "shll $4, %0\n\t" + "addl %2, %0\n\t" + "movzbl 2(%1), %2\n\t" + "jecxz 1f\n\t" + "shll $4, %0\n\t" + "addl %2, %0\n\t" + "movzbl 3(%1), %2\n\t" + "jecxz 1f\n\t" + "shll $4, %0\n\t" + "addl %2, %0\n\t" + "movzbl 4(%1), %2\n\t" + "jecxz 1f\n\t" + "shll $4, %0\n\t" + "addl $5, %1\n\t" + "addl %2, %0\n\t" + "movzbl (%1), %2\n\t" + "jecxz 1f\n" + "2:\t" + "shll $4, %0\n\t" + "movl $0xf0000000, %3\n\t" + "incl %1\n\t" + "addl %2, %0\n\t" + "andl %0, %3\n\t" + "andl $0x0fffffff, %0\n\t" + "shrl $24, %3\n\t" + "movzbl (%1), %2\n\t" + "xorl %3, %0\n\t" + "testl %2, %2\n\t" + "jnz 2b\n" + "1:\t" + : "=&r" (result), "=r" (name), "=&c" (temp0), "=&r" (temp1) + : "0" (0), "1" ((const unsigned char *) name)); + + return result; +} + +#endif /* dl-hash.h */ diff --git a/REORG.TODO/sysdeps/i386/i686/ffs.c b/REORG.TODO/sysdeps/i386/i686/ffs.c new file mode 100644 index 0000000000..cbe36ff873 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/ffs.c @@ -0,0 +1,48 @@ +/* ffs -- find first set bit in a word, counted from least significant end. + For Intel 80x86, x>=6. + This file is part of the GNU C Library. + Copyright (C) 1991-2017 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@cygnus.com>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define ffsl __something_else +#include <string.h> + +#undef ffs + +#ifdef __GNUC__ + +int +__ffs (int x) +{ + int cnt; + int tmp; + + asm ("bsfl %2,%0\n" /* Count low bits in X and store in %1. */ + "cmovel %1,%0\n" /* If number was zero, use -1 as result. */ + : "=&r" (cnt), "=r" (tmp) : "rm" (x), "1" (-1)); + + return cnt + 1; +} +weak_alias (__ffs, ffs) +libc_hidden_def (__ffs) +libc_hidden_builtin_def (ffs) +#undef ffsl +weak_alias (__ffs, ffsl) + +#else +#include <string/ffs.c> +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S new file mode 100644 index 0000000000..73060b088c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_log.S @@ -0,0 +1,29 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for i686 instructions. 
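+ *
+ * (The i686 adaptation is the use of the P6 fucomi/fcomip
+ * instructions, which compare and set EFLAGS directly, instead of
+ * the fnstsw/sahf sequence needed on older processors.)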
+ */ + +#include <machine/asm.h> + + + .text +ENTRY(__ieee754_log) + fldln2 // log(2) + fldl 4(%esp) // x : log(2) + fucomi %st + jp 3f + fyl2x // log(x) + ret + +3: fstp %st(1) + ret +END (__ieee754_log) + +ENTRY(__log_finite) + fldln2 // log(2) + fldl 4(%esp) // x : log(2) + fyl2x // log(x) + ret +END(__log_finite) diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S new file mode 100644 index 0000000000..6fd39d50d3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_logf.S @@ -0,0 +1,30 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * Adapted for float by Ulrich Drepper <drepper@cygnus.com>. + * + * Adapted for i686 instructions. + */ + +#include <machine/asm.h> + + + .text +ENTRY(__ieee754_logf) + fldln2 // log(2) + flds 4(%esp) // x : log(2) + fucomi %st + jp 3f + fyl2x // log(x) + ret + +3: fstp %st(1) + ret +END (__ieee754_logf) + +ENTRY(__logf_finite) + fldln2 // log(2) + flds 4(%esp) // x : log(2) + fyl2x // log(x) + ret +END(__logf_finite) diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S b/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S new file mode 100644 index 0000000000..7e3bc8d817 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/e_logl.S @@ -0,0 +1,94 @@ +/* + * Written by J.T. Conklin <jtc@netbsd.org>. + * Public domain. + * + * Adapted for `long double' by Ulrich Drepper <drepper@cygnus.com>. + * Changed to use fyl2xp1 for values near 1, <drepper@cygnus.com>. + * Adapted for i686 instructions. + */ + +#include <machine/asm.h> + + .section .rodata.cst8,"aM",@progbits,8 + + .p2align 3 + .type one,@object +one: .double 1.0 + ASM_SIZE_DIRECTIVE(one) + /* It is not important that this constant is precise. It is only + a value which is known to be on the safe side for using the + fyl2xp1 instruction. */ + .type limit,@object +limit: .double 0.29 + ASM_SIZE_DIRECTIVE(limit) + + +#ifdef PIC +# define MO(op) op##@GOTOFF(%edx) +#else +# define MO(op) op +#endif + + .text +ENTRY(__ieee754_logl) + fldln2 // log(2) + fldt 4(%esp) // x : log(2) + fucomi %st + jp 3f +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fld %st // x : x : log(2) + movzwl 4+8(%esp), %eax + cmpl $0xc000, %eax + jae 5f // x <= -2, avoid overflow from -LDBL_MAX - 1. + fsubl MO(one) // x-1 : x : log(2) +5: fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fld MO(limit) // 0.29 : |x-1| : x-1 : x : log(2) + fcomip %st(1) // |x-1| : x-1 : x : log(2) + fstp %st(0) // x-1 : x : log(2) + jc 2f + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 4f + fabs // log(1) is +0 in all rounding modes. +4: fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret + +2: fstp %st(0) // x : log(2) + fyl2x // log(x) + ret + +3: fstp %st(1) + fadd %st(0) + ret +END (__ieee754_logl) + +ENTRY(__logl_finite) + fldln2 // log(2) + fldt 4(%esp) // x : log(2) +#ifdef PIC + LOAD_PIC_REG (dx) +#endif + fld %st // x : x : log(2) + fsubl MO(one) // x-1 : x : log(2) + fld %st // x-1 : x-1 : x : log(2) + fabs // |x-1| : x-1 : x : log(2) + fld MO(limit) // 0.29 : |x-1| : x-1 : x : log(2) + fcomip %st(1) // |x-1| : x-1 : x : log(2) + fstp %st(0) // x-1 : x : log(2) + jc 2b + fxam + fnstsw + andb $0x45, %ah + cmpb $0x40, %ah + jne 6f + fabs // log(1) is +0 in all rounding modes. 
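+	// fyl2xp1 computes st(1)*log2(1+st(0)) and is only specified
+	// for |st(0)| < 1-sqrt(2)/2 ~= 0.2929, which is what the 0.29
+	// limit constant above guards.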
+6: fstp %st(1) // x-1 : log(2) + fyl2xp1 // log(x) + ret +END(__logl_finite) diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile new file mode 100644 index 0000000000..7d9089232f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/Makefile @@ -0,0 +1,4 @@ +ifeq ($(subdir),math) +libm-sysdep_routines += e_expf-sse2 e_expf-ia32 s_sinf-sse2 s_cosf-sse2 \ + s_sincosf-sse2 +endif diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S new file mode 100644 index 0000000000..b486b4d1ca --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-ia32.S @@ -0,0 +1,22 @@ +/* + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define __ieee754_expf __ieee754_expf_ia32 +#define __expf_finite __expf_finite_ia32 + +#include <sysdeps/i386/fpu/e_expf.S> diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S new file mode 100644 index 0000000000..e6bb6fa289 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf-sse2.S @@ -0,0 +1,325 @@ +/* SSE2 version of __ieee754_expf and __expf_finite + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#include <sysdep.h> + +/* Short algorithm description: + * + * Let K = 64 (table size). + * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y)) + * where + * x = m*log(2)/K + y, y in [0.0..log(2)/K] + * m = n*K + j, m,n,j - signed integer, j in [0..K-1] + * values of 2^(j/K) are tabulated as T[j]. + * + * P(y) is a minimax polynomial approximation of expf(x)-1 + * on small interval [0.0..log(2)/K]. 
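+ *
+ * Reconstruction: from x = (n*K+j)*log(2)/K + y it follows that
+ * e^x = 2^n * 2^(j/K) * e^y = 2^n * T[j] * (1 + P(y)).
+ * A rough C model of the main path (with P and the T[] table as
+ * described here, K = 64) would be
+ *
+ *   int m = (int) nearbyint (x * 64.0 / M_LN2);
+ *   double y = x - m * (M_LN2 / 64.0);
+ *   double r = ldexp (T[m & 63] * (1.0 + P (y)), m >> 6);
+ *
+ * where the code below stands in for nearbyint with the usual
+ * add-then-subtract 2^23+2^22 rounding trick (DP_RS/SP_RS).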
+ * + * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as + * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y + * + * Special cases: + * __ieee754_expf_sse2(NaN) = NaN + * __ieee754_expf_sse2(+INF) = +INF + * __ieee754_expf_sse2(-INF) = 0 + * __ieee754_expf_sse2(x) = 1 for subnormals + * for finite argument, only __ieee754_expf_sse2(0)=1 is exact + * __ieee754_expf_sse2(x) overflows if x>700 + * __ieee754_expf_sse2(x) underflows if x<-700 + * + * Note: + * For |x|<700, __ieee754_expf_sse2 computes result in double precision, + * with accuracy a bit more than needed for expf, and does not round it + * to single precision. + */ + + +#ifdef PIC +# define MO1(symbol) L(symbol)##@GOTOFF(%edx) +# define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%edx,reg2,_scale) +#else +# define MO1(symbol) L(symbol) +# define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale) +#endif + + .text +ENTRY(__ieee754_expf_sse2) + /* Input: single precision x on stack at address 4(%esp) */ + +#ifdef PIC + LOAD_PIC_REG(dx) +#endif + + cvtss2sd 4(%esp), %xmm1 /* Convert x to double precision */ + mov 4(%esp), %ecx /* Copy x */ + movsd MO1(DP_KLN2), %xmm2 /* DP K/log(2) */ + movsd MO1(DP_P2), %xmm3 /* DP P2 */ + movl %ecx, %eax /* x */ + mulsd %xmm1, %xmm2 /* DP x*K/log(2) */ + andl $0x7fffffff, %ecx /* |x| */ + cmpl $0x442f0000, %ecx /* |x|<700 ? */ + movsd MO1(DP_P3), %xmm4 /* DP P3 */ + addsd MO1(DP_RS), %xmm2 /* DP x*K/log(2)+RS */ + jae L(special_paths) + + /* Here if |x|<700 */ + cmpl $0x31800000, %ecx /* |x|<2^(-28) ? */ + jb L(small_arg) + + /* Main path: here if 2^(-28)<=|x|<700 */ + cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */ + movd %xmm2, %eax /* bits of n*K+j with trash */ + subss MO1(SP_RS), %xmm2 /* SP t=round(x*K/log(2)) */ + movl %eax, %ecx /* n*K+j with trash */ + cvtss2sd %xmm2, %xmm2 /* DP t */ + andl $0x3f, %eax /* bits of j */ + mulsd MO1(DP_NLN2K), %xmm2 /* DP -t*log(2)/K */ + andl $0xffffffc0, %ecx /* bits of n */ +#ifdef __AVX__ + vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */ + vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */ +#else + addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */ + movaps %xmm2, %xmm0 /* DP y */ + mulsd %xmm2, %xmm2 /* DP z=y*y */ +#endif + mulsd %xmm2, %xmm4 /* DP P3*z */ + addl $0xffc0, %ecx /* bits of n + DP exponent bias */ + mulsd %xmm2, %xmm3 /* DP P2*z */ + shrl $2, %ecx /* High 2 bytes of DP 2^n */ + pxor %xmm1, %xmm1 /* clear %xmm1 */ + addsd MO1(DP_P1), %xmm4 /* DP P3*z+P1 */ + addsd MO1(DP_P0), %xmm3 /* DP P2*z+P0 */ + pinsrw $3, %ecx, %xmm1 /* DP 2^n */ + mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */ + mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */ + addsd %xmm4, %xmm0 /* DP P(y) */ + mulsd MO2(DP_T,%eax,8), %xmm0 /* DP P(y)*T[j] */ + addsd MO2(DP_T,%eax,8), %xmm0 /* DP T[j]*(P(y)+1) */ + mulsd %xmm1, %xmm0 /* DP result=2^n*(T[j]*(P(y)+1)) */ + cvtsd2ss %xmm0, %xmm1 + + lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */ + movss %xmm1, 0(%esp) /* Move result from sse... */ + flds 0(%esp) /* ...to FPU. */ + lea 4(%esp), %esp /* Return back 4 bytes of stack frame */ + ret + + .p2align 4 +L(small_arg): + /* Here if 0<=|x|<2^(-28) */ + movss 4(%esp), %xmm0 /* load x */ + addss MO1(SP_ONE), %xmm0 /* 1.0 + x */ + /* Return 1.0 with inexact raised, except for x==0 */ + jmp L(epilogue) + + .p2align 4 +L(special_paths): + /* Here if x is NaN, or Inf, or finite |x|>=700 */ + movss 4(%esp), %xmm0 /* load x */ + + cmpl $0x7f800000, %ecx /* |x| is finite ? */ + jae L(arg_inf_or_nan) + + /* Here if finite |x|>=700 */ + testl $0x80000000, %eax /* sign of x nonzero ? 
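+					   (sign clear: x >= 700, overflow;
+					    sign set: x <= -700, underflow)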
*/ + je L(res_overflow) + + /* Here if finite x<=-700 */ + movss MO1(SP_SMALL), %xmm0 /* load small value 2^(-100) */ + mulss %xmm0, %xmm0 /* Return underflowed result (zero or subnormal) */ + jmp L(epilogue) + + .p2align 4 +L(res_overflow): + /* Here if finite x>=700 */ + movss MO1(SP_LARGE), %xmm0 /* load large value 2^100 */ + mulss %xmm0, %xmm0 /* Return overflowed result (Inf or max normal) */ + jmp L(epilogue) + + .p2align 4 +L(arg_inf_or_nan): + /* Here if |x| is Inf or NAN */ + jne L(arg_nan) /* |x| is Inf ? */ + + /* Here if |x| is Inf */ + shrl $31, %eax /* Get sign bit of x */ + movss MO2(SP_INF_0,%eax,4), %xmm0/* return zero or Inf, depending on sign of x */ + jmp L(epilogue) + + .p2align 4 +L(arg_nan): + /* Here if |x| is NaN */ + addss %xmm0, %xmm0 /* Return x+x (raise invalid) */ + + .p2align 4 +L(epilogue): + lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */ + movss %xmm0, 0(%esp) /* Move result from sse... */ + flds 0(%esp) /* ...to FPU. */ + lea 4(%esp), %esp /* Return back 4 bytes of stack frame */ + ret +END(__ieee754_expf_sse2) + + .section .rodata, "a" + .p2align 3 +L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */ + .long 0x00000000, 0x3ff00000 + .long 0x3e778061, 0x3ff02c9a + .long 0xd3158574, 0x3ff059b0 + .long 0x18759bc8, 0x3ff08745 + .long 0x6cf9890f, 0x3ff0b558 + .long 0x32d3d1a2, 0x3ff0e3ec + .long 0xd0125b51, 0x3ff11301 + .long 0xaea92de0, 0x3ff1429a + .long 0x3c7d517b, 0x3ff172b8 + .long 0xeb6fcb75, 0x3ff1a35b + .long 0x3168b9aa, 0x3ff1d487 + .long 0x88628cd6, 0x3ff2063b + .long 0x6e756238, 0x3ff2387a + .long 0x65e27cdd, 0x3ff26b45 + .long 0xf51fdee1, 0x3ff29e9d + .long 0xa6e4030b, 0x3ff2d285 + .long 0x0a31b715, 0x3ff306fe + .long 0xb26416ff, 0x3ff33c08 + .long 0x373aa9cb, 0x3ff371a7 + .long 0x34e59ff7, 0x3ff3a7db + .long 0x4c123422, 0x3ff3dea6 + .long 0x21f72e2a, 0x3ff4160a + .long 0x6061892d, 0x3ff44e08 + .long 0xb5c13cd0, 0x3ff486a2 + .long 0xd5362a27, 0x3ff4bfda + .long 0x769d2ca7, 0x3ff4f9b2 + .long 0x569d4f82, 0x3ff5342b + .long 0x36b527da, 0x3ff56f47 + .long 0xdd485429, 0x3ff5ab07 + .long 0x15ad2148, 0x3ff5e76f + .long 0xb03a5585, 0x3ff6247e + .long 0x82552225, 0x3ff66238 + .long 0x667f3bcd, 0x3ff6a09e + .long 0x3c651a2f, 0x3ff6dfb2 + .long 0xe8ec5f74, 0x3ff71f75 + .long 0x564267c9, 0x3ff75feb + .long 0x73eb0187, 0x3ff7a114 + .long 0x36cf4e62, 0x3ff7e2f3 + .long 0x994cce13, 0x3ff82589 + .long 0x9b4492ed, 0x3ff868d9 + .long 0x422aa0db, 0x3ff8ace5 + .long 0x99157736, 0x3ff8f1ae + .long 0xb0cdc5e5, 0x3ff93737 + .long 0x9fde4e50, 0x3ff97d82 + .long 0x82a3f090, 0x3ff9c491 + .long 0x7b5de565, 0x3ffa0c66 + .long 0xb23e255d, 0x3ffa5503 + .long 0x5579fdbf, 0x3ffa9e6b + .long 0x995ad3ad, 0x3ffae89f + .long 0xb84f15fb, 0x3ffb33a2 + .long 0xf2fb5e47, 0x3ffb7f76 + .long 0x904bc1d2, 0x3ffbcc1e + .long 0xdd85529c, 0x3ffc199b + .long 0x2e57d14b, 0x3ffc67f1 + .long 0xdcef9069, 0x3ffcb720 + .long 0x4a07897c, 0x3ffd072d + .long 0xdcfba487, 0x3ffd5818 + .long 0x03db3285, 0x3ffda9e6 + .long 0x337b9b5f, 0x3ffdfc97 + .long 0xe78b3ff6, 0x3ffe502e + .long 0xa2a490da, 0x3ffea4af + .long 0xee615a27, 0x3ffefa1b + .long 0x5b6e4540, 0x3fff5076 + .long 0x819e90d8, 0x3fffa7c1 + .type L(DP_T), @object + ASM_SIZE_DIRECTIVE(L(DP_T)) + + .section .rodata.cst8,"aM",@progbits,8 + .p2align 3 +L(DP_KLN2): /* double precision K/log(2) */ + .long 0x652b82fe, 0x40571547 + .type L(DP_KLN2), @object + ASM_SIZE_DIRECTIVE(L(DP_KLN2)) + + .p2align 3 +L(DP_NLN2K): /* double precision -log(2)/K */ + .long 0xfefa39ef, 0xbf862e42 + .type L(DP_NLN2K), @object + 
ASM_SIZE_DIRECTIVE(L(DP_NLN2K)) + + .p2align 3 +L(DP_RS): /* double precision 2^23+2^22 */ + .long 0x00000000, 0x41680000 + .type L(DP_RS), @object + ASM_SIZE_DIRECTIVE(L(DP_RS)) + + .p2align 3 +L(DP_P3): /* double precision polynomial coefficient P3 */ + .long 0xeb78fa85, 0x3fa56420 + .type L(DP_P3), @object + ASM_SIZE_DIRECTIVE(L(DP_P3)) + + .p2align 3 +L(DP_P1): /* double precision polynomial coefficient P1 */ + .long 0x008d6118, 0x3fe00000 + .type L(DP_P1), @object + ASM_SIZE_DIRECTIVE(L(DP_P1)) + + .p2align 3 +L(DP_P2): /* double precision polynomial coefficient P2 */ + .long 0xda752d4f, 0x3fc55550 + .type L(DP_P2), @object + ASM_SIZE_DIRECTIVE(L(DP_P2)) + + .p2align 3 +L(DP_P0): /* double precision polynomial coefficient P0 */ + .long 0xffffe7c6, 0x3fefffff + .type L(DP_P0), @object + ASM_SIZE_DIRECTIVE(L(DP_P0)) + + .p2align 2 +L(SP_INF_0): + .long 0x7f800000 /* single precision Inf */ + .long 0 /* single precision zero */ + .type L(SP_INF_0), @object + ASM_SIZE_DIRECTIVE(L(SP_INF_0)) + + .section .rodata.cst4,"aM",@progbits,4 + .p2align 2 +L(SP_RS): /* single precision 2^23+2^22 */ + .long 0x4b400000 + .type L(SP_RS), @object + ASM_SIZE_DIRECTIVE(L(SP_RS)) + + .p2align 2 +L(SP_SMALL): /* single precision small value 2^(-100) */ + .long 0x0d800000 + .type L(SP_SMALL), @object + ASM_SIZE_DIRECTIVE(L(SP_SMALL)) + + .p2align 2 +L(SP_LARGE): /* single precision large value 2^100 */ + .long 0x71800000 + .type L(SP_LARGE), @object + ASM_SIZE_DIRECTIVE(L(SP_LARGE)) + + .p2align 2 +L(SP_ONE): /* single precision 1.0 */ + .long 0x3f800000 + .type L(SP_ONE), @object + ASM_SIZE_DIRECTIVE(L(SP_ONE)) + +strong_alias (__ieee754_expf_sse2, __expf_finite_sse2) diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c new file mode 100644 index 0000000000..388cf98a39 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/e_expf.c @@ -0,0 +1,37 @@ +/* Multiple versions of expf + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern double __ieee754_expf_sse2 (double); +extern double __ieee754_expf_ia32 (double); + +double __ieee754_expf (double); +libm_ifunc (__ieee754_expf, + HAS_CPU_FEATURE (SSE2) + ? __ieee754_expf_sse2 + : __ieee754_expf_ia32); + +extern double __expf_finite_sse2 (double); +extern double __expf_finite_ia32 (double); + +double __expf_finite (double); +libm_ifunc (__expf_finite, + HAS_CPU_FEATURE (SSE2) + ? 
__expf_finite_sse2 + : __expf_finite_ia32); diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps new file mode 100644 index 0000000000..04bc23b37b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps @@ -0,0 +1,2188 @@ +# Begin of automatic generation + +# Maximal error of functions: +Function: "acos": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "acos_downward": +ildouble: 2 +ldouble: 2 + +Function: "acos_towardzero": +ildouble: 2 +ldouble: 2 + +Function: "acos_upward": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "acosh": +double: 1 +idouble: 1 +ildouble: 4 +ldouble: 2 + +Function: "acosh_downward": +double: 1 +idouble: 1 +ildouble: 6 +ldouble: 4 + +Function: "acosh_towardzero": +double: 1 +idouble: 1 +ildouble: 6 +ldouble: 4 + +Function: "acosh_upward": +double: 1 +idouble: 1 +ildouble: 4 +ldouble: 3 + +Function: "asin": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "asin_downward": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "asin_towardzero": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "asin_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "asinh": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "asinh_downward": +double: 1 +float: 1 +idouble: 1 +ildouble: 5 +ldouble: 5 + +Function: "asinh_towardzero": +double: 1 +float: 1 +idouble: 1 +ildouble: 4 +ldouble: 4 + +Function: "asinh_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 5 +ldouble: 5 + +Function: "atan": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan2": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan2_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan2_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan2_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atan_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "atanh": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "atanh_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 4 + +Function: "atanh_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 5 +ldouble: 3 + +Function: "atanh_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 5 +ldouble: 5 + +Function: "cabs": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "cabs_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "cabs_towardzero": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "cabs_upward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cacos": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "cacos": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cacos_downward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacos_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 6 +ldouble: 6 + +Function: Real part of "cacos_towardzero": +double: 2 +float: 1 +idouble: 2 
+ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacos_towardzero": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Real part of "cacos_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacos_upward": +double: 7 +float: 7 +idouble: 7 +ifloat: 7 +ildouble: 7 +ldouble: 7 + +Function: Real part of "cacosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cacosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cacosh_downward": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "cacosh_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "cacosh_towardzero": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "cacosh_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cacosh_upward": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "cacosh_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "carg": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "carg_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "carg_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "carg_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "casin": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "casin": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Real part of "casin_downward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "casin_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 6 +ldouble: 6 + +Function: Real part of "casin_towardzero": +double: 3 +float: 1 +idouble: 3 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "casin_towardzero": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Real part of "casin_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "casin_upward": +double: 7 +float: 7 +idouble: 7 +ifloat: 7 +ildouble: 7 +ldouble: 7 + +Function: Real part of "casinh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "casinh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "casinh_downward": +double: 5 +float: 3 +idouble: 5 +ifloat: 3 +ildouble: 6 +ldouble: 6 + +Function: Imaginary part of "casinh_downward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "casinh_towardzero": +double: 4 +float: 3 +idouble: 4 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "casinh_towardzero": +double: 3 +float: 1 +idouble: 3 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "casinh_upward": +double: 7 +float: 7 +idouble: 7 +ifloat: 7 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "casinh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "catan": +double: 1 +idouble: 1 +ildouble: 1 
+ldouble: 1 + +Function: Imaginary part of "catan": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catan_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan_downward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "catan_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "catan_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catan_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "catanh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "catanh": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catanh_downward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "catanh_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catanh_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "catanh_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "catanh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "catanh_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "cbrt": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "cbrt_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "cbrt_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "cbrt_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ccos": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "ccos": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ccos_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccos_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccos_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccos_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccos_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "ccos_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "ccosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "ccosh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ccosh_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ccosh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccosh_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + 
+Function: Imaginary part of "ccosh_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ccosh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "ccosh_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cexp": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "cexp": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cexp_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "cexp_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "cexp_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "cexp_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "cexp_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cexp_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "clog": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "clog10": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "clog10": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: Real part of "clog10_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 8 +ldouble: 8 + +Function: Imaginary part of "clog10_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog10_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 8 +ldouble: 8 + +Function: Imaginary part of "clog10_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog10_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "clog10_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "clog_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "clog_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "clog_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "clog_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "clog_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "clog_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "cos": +ildouble: 1 +ldouble: 1 + +Function: "cos_downward": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "cos_towardzero": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "cos_upward": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "cosh": +double: 1 +float: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "cosh_downward": +double: 2 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 3 + +Function: "cosh_towardzero": +double: 2 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 
2 +ldouble: 2 + +Function: "cosh_upward": +double: 4 +float: 2 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 3 + +Function: Real part of "cpow": +double: 2 +float: 5 +idouble: 2 +ifloat: 5 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "cpow": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "cpow_downward": +double: 5 +float: 8 +idouble: 5 +ifloat: 8 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "cpow_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "cpow_towardzero": +double: 5 +float: 8 +idouble: 5 +ifloat: 8 +ildouble: 7 +ldouble: 7 + +Function: Imaginary part of "cpow_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 1 +ldouble: 1 + +Function: Real part of "cpow_upward": +double: 4 +float: 1 +idouble: 4 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "cpow_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: Real part of "csin": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "csin": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 + +Function: Real part of "csin_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csin_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csin_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csin_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csin_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csin_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csinh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "csinh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "csinh_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csinh_downward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csinh_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csinh_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csinh_upward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "csinh_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Real part of "csqrt": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "csqrt": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Real part of "csqrt_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "csqrt_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "csqrt_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "csqrt_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "csqrt_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 5 +ldouble: 
5 + +Function: Imaginary part of "csqrt_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctan": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Imaginary part of "ctan": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Real part of "ctan_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "ctan_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctan_towardzero": +double: 3 +float: 1 +idouble: 3 +ifloat: 1 +ildouble: 5 +ldouble: 5 + +Function: Imaginary part of "ctan_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctan_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ctan_upward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ctanh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: Imaginary part of "ctanh": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: Real part of "ctanh_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "ctanh_downward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Real part of "ctanh_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: Imaginary part of "ctanh_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Real part of "ctanh_upward": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: Imaginary part of "ctanh_upward": +double: 3 +float: 2 +idouble: 3 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "erf": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erf_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erf_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erf_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "erfc": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "erfc_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "erfc_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "erfc_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "exp": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp10": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp10_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp10_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp10_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "exp2": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp2_downward": +ildouble: 1 +ldouble: 1 + +Function: "exp2_towardzero": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp2_upward": +ildouble: 1 +ldouble: 1 + +Function: "exp_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "exp_towardzero": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + 
+Function: "exp_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "expm1": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "expm1_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "expm1_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "expm1_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "gamma": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "gamma_downward": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 7 +ldouble: 7 + +Function: "gamma_towardzero": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 7 +ldouble: 7 + +Function: "gamma_upward": +double: 3 +float: 4 +idouble: 3 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: "hypot": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "hypot_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "hypot_towardzero": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "hypot_upward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "j0": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "j0_downward": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "j0_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 5 +ldouble: 5 + +Function: "j0_upward": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "j1": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "j1_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 4 +ldouble: 4 + +Function: "j1_towardzero": +double: 2 +float: 1 +idouble: 2 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "j1_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 3 +ldouble: 3 + +Function: "jn": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "jn_downward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "jn_towardzero": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "jn_upward": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "lgamma": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "lgamma_downward": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 7 +ldouble: 7 + +Function: "lgamma_towardzero": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 7 +ldouble: 7 + +Function: "lgamma_upward": +double: 3 +float: 4 +idouble: 3 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: "log": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "log10": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "log10_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "log10_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "log10_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "log1p": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "log1p_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "log1p_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 4 + +Function: "log1p_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "log2": +double: 1 +idouble: 1 
+ildouble: 1 +ldouble: 1 + +Function: "log2_downward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "log2_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "log2_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "log_downward": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "log_towardzero": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "log_upward": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "pow": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "pow10": +double: 1 +idouble: 1 +ildouble: 1 +ldouble: 1 + +Function: "pow10_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "pow10_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "pow10_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "pow_downward": +double: 1 +idouble: 1 +ildouble: 4 +ldouble: 4 + +Function: "pow_towardzero": +double: 1 +idouble: 1 +ildouble: 4 +ldouble: 4 + +Function: "pow_upward": +double: 1 +idouble: 1 +ildouble: 4 +ldouble: 4 + +Function: "sin": +ildouble: 1 +ldouble: 1 + +Function: "sin_downward": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "sin_towardzero": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "sin_upward": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "sincos": +ildouble: 1 +ldouble: 1 + +Function: "sincos_downward": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "sincos_towardzero": +double: 1 +idouble: 1 +ildouble: 2 +ldouble: 2 + +Function: "sincos_upward": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "sinh": +double: 1 +ildouble: 2 +ldouble: 2 + +Function: "sinh_downward": +double: 2 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 5 + +Function: "sinh_towardzero": +double: 2 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 4 + +Function: "sinh_upward": +double: 4 +float: 2 +idouble: 1 +ifloat: 1 +ildouble: 4 +ldouble: 5 + +Function: "tan": +float: 1 +ifloat: 1 +ildouble: 2 +ldouble: 2 + +Function: "tan_downward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "tan_towardzero": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "tan_upward": +double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "tanh": +double: 1 +idouble: 1 +ildouble: 3 +ldouble: 3 + +Function: "tanh_downward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 7 +ldouble: 4 + +Function: "tanh_towardzero": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 3 +ldouble: 3 + +Function: "tanh_upward": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 5 +ldouble: 4 + +Function: "tgamma": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "tgamma_downward": +double: 3 +float: 4 +idouble: 3 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: "tgamma_towardzero": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: "tgamma_upward": +double: 4 +float: 4 +idouble: 4 +ifloat: 4 +ildouble: 5 +ldouble: 5 + +Function: "y0": +double: 1 +float: 1 +idouble: 1 +ifloat: 1 +ildouble: 1 +ldouble: 1 + +Function: "y0_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: "y0_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: "y0_upward": 
+double: 1 +float: 2 +idouble: 1 +ifloat: 2 +ildouble: 3 +ldouble: 3 + +Function: "y1": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 2 +ldouble: 2 + +Function: "y1_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 7 +ldouble: 7 + +Function: "y1_towardzero": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: "y1_upward": +double: 1 +float: 3 +idouble: 1 +ifloat: 3 +ildouble: 7 +ldouble: 7 + +Function: "yn": +double: 2 +float: 3 +idouble: 2 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +Function: "yn_downward": +double: 2 +float: 2 +idouble: 2 +ifloat: 2 +ildouble: 5 +ldouble: 5 + +Function: "yn_towardzero": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 5 +ldouble: 5 + +Function: "yn_upward": +double: 3 +float: 3 +idouble: 3 +ifloat: 3 +ildouble: 4 +ldouble: 4 + +# end of automatic generation diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name new file mode 100644 index 0000000000..193dd704b3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps-name @@ -0,0 +1 @@ +i686 diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S new file mode 100644 index 0000000000..f37850d0b3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf-sse2.S @@ -0,0 +1,553 @@ +/* Optimized with sse2 version of cosf + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#define __need_Emath +#include <bits/errno.h> + +/* Short algorithm description: + * + * 1) if |x| == 0: return 1.0-|x|. + * 2) if |x| < 2^-27: return 1.0-|x|. + * 3) if |x| < 2^-5 : return 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1. + * 4) if |x| < Pi/4: return 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). + * 5) if |x| < 9*Pi/4: + * 5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+3, + * t=|x|-j*Pi/4. + * 5.2) Reconstruction: + * s = (-1.0)^((n>>2)&1) + * if(n&2 != 0) { + * using cos(t) polynomial for |t|<Pi/4, result is + * s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))). + * } else { + * using sin(t) polynomial for |t|<Pi/4, result is + * s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))). + * } + * 6) if |x| < 2^23, large args: + * 6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3, + * t=|x|-j*Pi/4. + * 6.2) Reconstruction same as (5.2). + * 7) if |x| >= 2^23, very large args: + * 7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+3, + * t=|x|-j*Pi/4. + * 7.2) Reconstruction same as (5.2). + * 8) if x is Inf, return x-x, and set errno=EDOM. + * 9) if x is NaN, return x-x.
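+ *
+ * A rough C model of the range reduction and reconstruction in (5)-(7),
+ * assuming hypothetical helpers sin_poly/cos_poly that evaluate the
+ * parenthesized S0..S4/C0..C4 factors above (a sketch only; the code's
+ * actual arithmetic splits Pi/4 into high and low parts for accuracy):
+ *
+ *   double ax = fabs (x);                      /* needs <math.h> */
+ *   unsigned int k = (unsigned int) (ax / M_PI_4);
+ *   unsigned int j = (k + 1) & 0x0e;           /* 0xfffffffe for large args */
+ *   unsigned int n = k + 3;
+ *   double t = ax - j * M_PI_4;                /* |t| <= Pi/4 */
+ *   double s = ((n >> 2) & 1) ? -1.0 : 1.0;
+ *   double result = (n & 2) ? s * cos_poly (t)
+ *                           : s * t * sin_poly (t);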
+ * + * Special cases: + * cos(+-0) = 1 not raising inexact, + * cos(subnormal) raises inexact, + * cos(min_normalized) raises inexact, + * cos(normalized) raises inexact, + * cos(Inf) = NaN, raises invalid, sets errno to EDOM, + * cos(NaN) = NaN. + */ + +#ifdef PIC +# define MO1(symbol) L(symbol)##@GOTOFF(%ebx) +# define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%ebx,reg2,_scale) +# define CFI_PUSH(REG) cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0) +# define CFI_POP(REG) cfi_adjust_cfa_offset(-4); cfi_restore(REG) +# define PUSH(REG) pushl REG; CFI_PUSH(REG) +# define POP(REG) popl REG; CFI_POP(REG) +# define ENTRANCE PUSH(%ebx); LOAD_PIC_REG(bx) +# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx) +# define ARG_X 8(%esp) +#else +# define MO1(symbol) L(symbol) +# define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale) +# define ENTRANCE +# define RETURN ret +# define ARG_X 4(%esp) +#endif + + .text +ENTRY(__cosf_sse2) + /* Input: single precision x on stack at address ARG_X */ + + ENTRANCE + movl ARG_X, %eax /* Bits of x */ + cvtss2sd ARG_X, %xmm0 /* DP x */ + andl $0x7fffffff, %eax /* |x| */ + + cmpl $0x3f490fdb, %eax /* |x|<Pi/4? */ + jb L(arg_less_pio4) + + /* Here if |x|>=Pi/4 */ + movd %eax, %xmm3 /* SP |x| */ + andpd MO1(DP_ABS_MASK),%xmm0 /* DP |x| */ + movss MO1(SP_INVPIO4), %xmm2 /* SP 1/(Pi/4) */ + + cmpl $0x40e231d6, %eax /* |x|<9*Pi/4? */ + jae L(large_args) + + /* Here if Pi/4<=|x|<9*Pi/4 */ + mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */ + cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */ + addl $1, %eax /* k+1 */ + movl $0x0e, %edx + andl %eax, %edx /* j = (k+1)&0x0e */ + addl $2, %eax /* n */ + subsd MO2(PIO4J,%edx,8), %xmm0 /* t = |x| - j * Pi/4 */ + +L(reconstruction): + /* Input: %eax=n, %xmm0=t */ + testl $2, %eax /* n&2 != 0? */ + jz L(sin_poly) + +/*L(cos_poly):*/ + /* Here if cos(x) calculated using cos(t) polynomial for |t|<Pi/4: + * y = t*t; z = y*y; + * s = sign(x) * (-1.0)^((n>>2)&1) + * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))) + */ + shrl $2, %eax /* n>>2 */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + andl $1, %eax /* (n>>2)&1 */ + movaps %xmm0, %xmm1 /* y */ + mulsd %xmm0, %xmm0 /* z=t^4 */ + + movsd MO1(DP_C4), %xmm4 /* C4 */ + mulsd %xmm0, %xmm4 /* z*C4 */ + movsd MO1(DP_C3), %xmm3 /* C3 */ + mulsd %xmm0, %xmm3 /* z*C3 */ + addsd MO1(DP_C2), %xmm4 /* C2+z*C4 */ + mulsd %xmm0, %xmm4 /* z*(C2+z*C4) */ + lea -8(%esp), %esp /* Borrow 4 bytes of stack frame */ + addsd MO1(DP_C1), %xmm3 /* C1+z*C3 */ + mulsd %xmm0, %xmm3 /* z*(C1+z*C3) */ + addsd MO1(DP_C0), %xmm4 /* C0+z*(C2+z*C4) */ + mulsd %xmm1, %xmm4 /* y*(C0+z*(C2+z*C4)) */ + + addsd %xmm4, %xmm3 /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + /* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + addsd MO1(DP_ONES), %xmm3 + + mulsd MO2(DP_ONES,%eax,8), %xmm3 /* DP result */ + movsd %xmm3, 0(%esp) /* Move result from sse... */ + fldl 0(%esp) /* ...to FPU. 
*/ + /* Return back 4 bytes of stack frame */ + lea 8(%esp), %esp + RETURN + + .p2align 4 +L(sin_poly): + /* Here if cos(x) calculated using sin(t) polynomial for |t|<Pi/4: + * y = t*t; z = y*y; + * s = sign(x) * (-1.0)^((n>>2)&1) + * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))) + */ + + movaps %xmm0, %xmm4 /* t */ + shrl $2, %eax /* n>>2 */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + andl $1, %eax /* (n>>2)&1 */ + movaps %xmm0, %xmm1 /* y */ + mulsd %xmm0, %xmm0 /* z=t^4 */ + + movsd MO1(DP_S4), %xmm2 /* S4 */ + mulsd %xmm0, %xmm2 /* z*S4 */ + movsd MO1(DP_S3), %xmm3 /* S3 */ + mulsd %xmm0, %xmm3 /* z*S3 */ + lea -8(%esp), %esp /* Borrow 4 bytes of stack frame */ + addsd MO1(DP_S2), %xmm2 /* S2+z*S4 */ + mulsd %xmm0, %xmm2 /* z*(S2+z*S4) */ + addsd MO1(DP_S1), %xmm3 /* S1+z*S3 */ + mulsd %xmm0, %xmm3 /* z*(S1+z*S3) */ + addsd MO1(DP_S0), %xmm2 /* S0+z*(S2+z*S4) */ + mulsd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */ + /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */ + mulsd MO2(DP_ONES,%eax,8), %xmm4 + addsd %xmm2, %xmm3 /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + /* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + mulsd %xmm4, %xmm3 + /* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + addsd %xmm4, %xmm3 + movsd %xmm3, 0(%esp) /* Move result from sse... */ + fldl 0(%esp) /* ...to FPU. */ + /* Return back 4 bytes of stack frame */ + lea 8(%esp), %esp + RETURN + + .p2align 4 +L(large_args): + /* Here if |x|>=9*Pi/4 */ + cmpl $0x7f800000, %eax /* x is Inf or NaN? */ + jae L(arg_inf_or_nan) + + /* Here if finite |x|>=9*Pi/4 */ + cmpl $0x4b000000, %eax /* |x|<2^23? */ + jae L(very_large_args) + + /* Here if 9*Pi/4<=|x|<2^23 */ + movsd MO1(DP_INVPIO4), %xmm1 /* 1/(Pi/4) */ + mulsd %xmm0, %xmm1 /* |x|/(Pi/4) */ + cvttsd2si %xmm1, %eax /* k=trunc(|x|/(Pi/4)) */ + addl $1, %eax /* k+1 */ + movl %eax, %edx + andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */ + cvtsi2sdl %edx, %xmm4 /* DP j */ + movsd MO1(DP_PIO4HI), %xmm2 /* -PIO4HI = high part of -Pi/4 */ + mulsd %xmm4, %xmm2 /* -j*PIO4HI */ + movsd MO1(DP_PIO4LO), %xmm3 /* -PIO4LO = low part of -Pi/4 */ + addsd %xmm2, %xmm0 /* |x| - j*PIO4HI */ + addl $2, %eax /* n */ + mulsd %xmm3, %xmm4 /* j*PIO4LO */ + addsd %xmm4, %xmm0 /* t = |x| - j*PIO4HI - j*PIO4LO */ + jmp L(reconstruction) + + .p2align 4 +L(very_large_args): + /* Here if finite |x|>=2^23 */ + + /* bitpos = (ix>>23) - BIAS_32 + 59; */ + shrl $23, %eax /* eb = biased exponent of x */ + /* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */ + subl $68, %eax + movl $28, %ecx /* %cl=28 */ + movl %eax, %edx /* bitpos copy */ + + /* j = bitpos/28; */ + div %cl /* j in register %al=%ax/%cl */ + movapd %xmm0, %xmm3 /* |x| */ + /* clear unneeded remainder from %ah */ + andl $0xff, %eax + + imull $28, %eax, %ecx /* j*28 */ + movsd MO1(DP_HI_MASK), %xmm4 /* DP_HI_MASK */ + movapd %xmm0, %xmm5 /* |x| */ + mulsd -2*8+MO2(_FPI,%eax,8), %xmm3 /* tmp3 = FPI[j-2]*|x| */ + movapd %xmm0, %xmm1 /* |x| */ + mulsd -1*8+MO2(_FPI,%eax,8), %xmm5 /* tmp2 = FPI[j-1]*|x| */ + mulsd 0*8+MO2(_FPI,%eax,8), %xmm0 /* tmp0 = FPI[j]*|x| */ + addl $19, %ecx /* j*28+19 */ + mulsd 1*8+MO2(_FPI,%eax,8), %xmm1 /* tmp1 = FPI[j+1]*|x| */ + cmpl %ecx, %edx /* bitpos>=j*28+19? 
 */ + jl L(very_large_skip1) + + /* Here if bitpos>=j*28+19 */ + andpd %xmm3, %xmm4 /* HI(tmp3) */ + subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */ +L(very_large_skip1): + + movsd MO1(DP_2POW52), %xmm6 + movapd %xmm5, %xmm2 /* tmp2 copy */ + addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */ + movl $1, %edx + addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */ + movsd 8+MO1(DP_2POW52), %xmm4 + movd %xmm6, %eax /* k = I64_LO(tmp6); */ + addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */ + comisd %xmm5, %xmm4 /* tmp4 > tmp5? */ + jbe L(very_large_skip2) + + /* Here if tmp4 > tmp5 */ + subl $1, %eax /* k-- */ + addsd 8+MO1(DP_ONES), %xmm4 /* tmp4 -= 1.0 */ +L(very_large_skip2): + + andl %eax, %edx /* k&1 */ + subsd %xmm4, %xmm3 /* tmp3 -= tmp4 */ + addsd MO2(DP_ZERONE,%edx,8), %xmm3 /* t = DP_ZERONE[k&1] + tmp3 */ + addsd %xmm2, %xmm3 /* t += tmp2 */ + addsd %xmm3, %xmm0 /* t += tmp0 */ + addl $3, %eax /* n=k+3 */ + addsd %xmm1, %xmm0 /* t += tmp1 */ + mulsd MO1(DP_PIO4), %xmm0 /* t *= PIO4 */ + + jmp L(reconstruction) /* end of very_large_args path */ + + .p2align 4 +L(arg_less_pio4): + /* Here if |x|<Pi/4 */ + cmpl $0x3d000000, %eax /* |x|<2^-5? */ + jl L(arg_less_2pn5) + + /* Here if 2^-5<=|x|<Pi/4 */ + mulsd %xmm0, %xmm0 /* y=x^2 */ + movaps %xmm0, %xmm1 /* y */ + mulsd %xmm0, %xmm0 /* z=x^4 */ + movsd MO1(DP_C4), %xmm3 /* C4 */ + mulsd %xmm0, %xmm3 /* z*C4 */ + movsd MO1(DP_C3), %xmm5 /* C3 */ + mulsd %xmm0, %xmm5 /* z*C3 */ + addsd MO1(DP_C2), %xmm3 /* C2+z*C4 */ + mulsd %xmm0, %xmm3 /* z*(C2+z*C4) */ + addsd MO1(DP_C1), %xmm5 /* C1+z*C3 */ + mulsd %xmm0, %xmm5 /* z*(C1+z*C3) */ + addsd MO1(DP_C0), %xmm3 /* C0+z*(C2+z*C4) */ + mulsd %xmm1, %xmm3 /* y*(C0+z*(C2+z*C4)) */ + addsd %xmm5, %xmm3 /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + /* 1.0 + y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + addsd MO1(DP_ONES), %xmm3 + cvtsd2ss %xmm3, %xmm3 /* SP result */ + +L(epilogue): + lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */ + movss %xmm3, 0(%esp) /* Move result from sse... */ + flds 0(%esp) /* ...to FPU. */ + /* Return back 4 bytes of stack frame */ + lea 4(%esp), %esp + RETURN + + .p2align 4 +L(arg_less_2pn5): + /* Here if |x|<2^-5 */ + cmpl $0x32000000, %eax /* |x|<2^-27? */ + jl L(arg_less_2pn27) + + /* Here if 2^-27<=|x|<2^-5 */ + mulsd %xmm0, %xmm0 /* DP x^2 */ + movsd MO1(DP_COS2_1), %xmm3 /* DP DP_COS2_1 */ + mulsd %xmm0, %xmm3 /* DP x^2*DP_COS2_1 */ + addsd MO1(DP_COS2_0), %xmm3 /* DP DP_COS2_0+x^2*DP_COS2_1 */ + mulsd %xmm0, %xmm3 /* DP x^2*DP_COS2_0+x^4*DP_COS2_1 */ + /* DP 1.0+x^2*DP_COS2_0+x^4*DP_COS2_1 */ + addsd MO1(DP_ONES), %xmm3 + cvtsd2ss %xmm3, %xmm3 /* SP result */ + jmp L(epilogue) + + .p2align 4 +L(arg_less_2pn27): + /* Here if |x|<2^-27 */ + movss ARG_X, %xmm0 /* x */ + andps MO1(SP_ABS_MASK),%xmm0 /* |x| */ + movss MO1(SP_ONE), %xmm3 /* 1.0 */ + subss %xmm0, %xmm3 /* result is 1.0-|x| */ + jmp L(epilogue) + + .p2align 4 +L(arg_inf_or_nan): + /* Here if |x| is Inf or NAN */ + jne L(skip_errno_setting) /* in case x is NaN */ + + /* Here if x is Inf. Set errno to EDOM. */ + call JUMPTARGET(__errno_location) + movl $EDOM, (%eax) + + .p2align 4 +L(skip_errno_setting): + /* Here if |x| is Inf or NAN. Continued.
 */ + movss ARG_X, %xmm3 /* load x */ + subss %xmm3, %xmm3 /* Result is NaN */ + jmp L(epilogue) +END(__cosf_sse2) + + .section .rodata, "a" + .p2align 3 +L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */ + .long 0x00000000,0x00000000 + .long 0x54442d18,0x3fe921fb + .long 0x54442d18,0x3ff921fb + .long 0x7f3321d2,0x4002d97c + .long 0x54442d18,0x400921fb + .long 0x2955385e,0x400f6a7a + .long 0x7f3321d2,0x4012d97c + .long 0xe9bba775,0x4015fdbb + .long 0x54442d18,0x401921fb + .long 0xbeccb2bb,0x401c463a + .long 0x2955385e,0x401f6a7a + .type L(PIO4J), @object + ASM_SIZE_DIRECTIVE(L(PIO4J)) + + .p2align 3 +L(_FPI): /* 4/Pi broken into sum of positive DP values */ + .long 0x00000000,0x00000000 + .long 0x6c000000,0x3ff45f30 + .long 0x2a000000,0x3e3c9c88 + .long 0xa8000000,0x3c54fe13 + .long 0xd0000000,0x3aaf47d4 + .long 0x6c000000,0x38fbb81b + .long 0xe0000000,0x3714acc9 + .long 0x7c000000,0x3560e410 + .long 0x56000000,0x33bca2c7 + .long 0xac000000,0x31fbd778 + .long 0xe0000000,0x300b7246 + .long 0xe8000000,0x2e5d2126 + .long 0x48000000,0x2c970032 + .long 0xe8000000,0x2ad77504 + .long 0xe0000000,0x290921cf + .long 0xb0000000,0x274deb1c + .long 0xe0000000,0x25829a73 + .long 0xbe000000,0x23fd1046 + .long 0x10000000,0x2224baed + .long 0x8e000000,0x20709d33 + .long 0x80000000,0x1e535a2f + .long 0x64000000,0x1cef904e + .long 0x30000000,0x1b0d6398 + .long 0x24000000,0x1964ce7d + .long 0x16000000,0x17b908bf + .type L(_FPI), @object + ASM_SIZE_DIRECTIVE(L(_FPI)) + +/* Coefficients of polynomial + for cos(x)~=1.0+x^2*DP_COS2_0+x^4*DP_COS2_1, |x|<2^-5. */ + .p2align 3 +L(DP_COS2_0): + .long 0xff5cc6fd,0xbfdfffff + .type L(DP_COS2_0), @object + ASM_SIZE_DIRECTIVE(L(DP_COS2_0)) + + .p2align 3 +L(DP_COS2_1): + .long 0xb178dac5,0x3fa55514 + .type L(DP_COS2_1), @object + ASM_SIZE_DIRECTIVE(L(DP_COS2_1)) + + .p2align 3 +L(DP_ZERONE): + .long 0x00000000,0x00000000 /* 0.0 */ + .long 0x00000000,0xbff00000 /* -1.0 */ + .type L(DP_ZERONE),@object + ASM_SIZE_DIRECTIVE(L(DP_ZERONE)) + + .p2align 3 +L(DP_ONES): + .long 0x00000000,0x3ff00000 /* +1.0 */ + .long 0x00000000,0xbff00000 /* -1.0 */ + .type L(DP_ONES), @object + ASM_SIZE_DIRECTIVE(L(DP_ONES)) + +/* Coefficients of polynomial + for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4. */ + .p2align 3 +L(DP_S3): + .long 0x64e6b5b4,0x3ec71d72 + .type L(DP_S3), @object + ASM_SIZE_DIRECTIVE(L(DP_S3)) + + .p2align 3 +L(DP_S1): + .long 0x10c2688b,0x3f811111 + .type L(DP_S1), @object + ASM_SIZE_DIRECTIVE(L(DP_S1)) + + .p2align 3 +L(DP_S4): + .long 0x1674b58a,0xbe5a947e + .type L(DP_S4), @object + ASM_SIZE_DIRECTIVE(L(DP_S4)) + + .p2align 3 +L(DP_S2): + .long 0x8b4bd1f9,0xbf2a019f + .type L(DP_S2), @object + ASM_SIZE_DIRECTIVE(L(DP_S2)) + + .p2align 3 +L(DP_S0): + .long 0x55551cd9,0xbfc55555 + .type L(DP_S0), @object + ASM_SIZE_DIRECTIVE(L(DP_S0)) + +/* Coefficients of polynomial + for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4.
*/ + .p2align 3 +L(DP_C3): + .long 0x9ac43cc0,0x3efa00eb + .type L(DP_C3), @object + ASM_SIZE_DIRECTIVE(L(DP_C3)) + + .p2align 3 +L(DP_C1): + .long 0x545c50c7,0x3fa55555 + .type L(DP_C1), @object + ASM_SIZE_DIRECTIVE(L(DP_C1)) + + .p2align 3 +L(DP_C4): + .long 0xdd8844d7,0xbe923c97 + .type L(DP_C4), @object + ASM_SIZE_DIRECTIVE(L(DP_C4)) + + .p2align 3 +L(DP_C2): + .long 0x348b6874,0xbf56c16b + .type L(DP_C2), @object + ASM_SIZE_DIRECTIVE(L(DP_C2)) + + .p2align 3 +L(DP_C0): + .long 0xfffe98ae,0xbfdfffff + .type L(DP_C0), @object + ASM_SIZE_DIRECTIVE(L(DP_C0)) + + .p2align 3 +L(DP_PIO4): + .long 0x54442d18,0x3fe921fb /* Pi/4 */ + .type L(DP_PIO4), @object + ASM_SIZE_DIRECTIVE(L(DP_PIO4)) + + .p2align 3 +L(DP_2POW52): + .long 0x00000000,0x43300000 /* +2^52 */ + .long 0x00000000,0xc3300000 /* -2^52 */ + .type L(DP_2POW52), @object + ASM_SIZE_DIRECTIVE(L(DP_2POW52)) + + .p2align 3 +L(DP_INVPIO4): + .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */ + .type L(DP_INVPIO4), @object + ASM_SIZE_DIRECTIVE(L(DP_INVPIO4)) + + .p2align 3 +L(DP_PIO4HI): + .long 0x54000000,0xbfe921fb /* High part of Pi/4 */ + .type L(DP_PIO4HI), @object + ASM_SIZE_DIRECTIVE(L(DP_PIO4HI)) + + .p2align 3 +L(DP_PIO4LO): + .long 0x11A62633,0xbe010b46 /* Low part of Pi/4 */ + .type L(DP_PIO4LO), @object + ASM_SIZE_DIRECTIVE(L(DP_PIO4LO)) + + .p2align 2 +L(SP_INVPIO4): + .long 0x3fa2f983 /* 4/Pi */ + .type L(SP_INVPIO4), @object + ASM_SIZE_DIRECTIVE(L(SP_INVPIO4)) + + .p2align 4 +L(DP_ABS_MASK): /* Mask for getting DP absolute value */ + .long 0xffffffff,0x7fffffff + .long 0xffffffff,0x7fffffff + .type L(DP_ABS_MASK), @object + ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK)) + + .p2align 3 +L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */ + .long 0x00000000,0xffffffff + .type L(DP_HI_MASK), @object + ASM_SIZE_DIRECTIVE(L(DP_HI_MASK)) + + .p2align 4 +L(SP_ABS_MASK): /* Mask for getting SP absolute value */ + .long 0x7fffffff,0x7fffffff + .long 0x7fffffff,0x7fffffff + .type L(SP_ABS_MASK), @object + ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK)) + + .p2align 2 +L(SP_ONE): + .long 0x3f800000 /* 1.0 */ + .type L(SP_ONE), @object + ASM_SIZE_DIRECTIVE(L(SP_ONE)) + +weak_alias (__cosf, cosf) diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c new file mode 100644 index 0000000000..af588de9dc --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_cosf.c @@ -0,0 +1,29 @@ +/* Multiple versions of cosf + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern float __cosf_sse2 (float); +extern float __cosf_ia32 (float); +float __cosf (float); + +libm_ifunc (__cosf, HAS_CPU_FEATURE (SSE2) ? 
__cosf_sse2 : __cosf_ia32); +weak_alias (__cosf, cosf); + +#define COSF __cosf_ia32 +#include <sysdeps/ieee754/flt-32/s_cosf.c> diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S new file mode 100644 index 0000000000..f31a925522 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf-sse2.S @@ -0,0 +1,586 @@ +/* Optimized with sse2 version of sincosf + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#define __need_Emath +#include <bits/errno.h> + +/* Short algorithm description: + * + * 1) if |x|==0: sin(x)=x, + * cos(x)=1. + * 2) if |x|<2^-27: sin(x)=x-x*DP_SMALL, raising underflow only when needed, + * cos(x)=1-|x|. + * 3) if |x|<2^-5 : sin(x)=x+x*x^2*DP_SIN2_0+x^5*DP_SIN2_1, + * cos(x)=1+1*x^2*DP_COS2_0+x^4*DP_COS2_1 + * 4) if |x|< Pi/4: sin(x)=x+x*x^2*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))), + * cos(x)=1+1*x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))). + * 5) if |x| < 9*Pi/4: + * 5.1) Range reduction: + * k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, t=|x|-j*Pi/4. + * 5.2) Reconstruction: + * sign_sin = sign(x) * (-1.0)^(( n >>2)&1) + * sign_cos = (-1.0)^(((n+2)>>2)&1) + * poly_sin = ((((S4*t^2 + S3)*t^2 + S2)*t^2 + S1)*t^2 + S0)*t^2*t+t + * poly_cos = ((((C4*t^2 + C3)*t^2 + C2)*t^2 + C1)*t^2 + C0)*t^2*s+s + * if(n&2 != 0) { + * using cos(t) and sin(t) polynomials for |t|<Pi/4, results are + * cos(x) = poly_sin * sign_cos + * sin(x) = poly_cos * sign_sin + * } else { + * sin(x) = poly_sin * sign_sin + * cos(x) = poly_cos * sign_cos + * } + * 6) if |x| < 2^23, large args: + * 6.1) Range reduction: + * k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4 + * 6.2) Reconstruction same as (5.2). + * 7) if |x| >= 2^23, very large args: + * 7.1) Range reduction: + * k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, t=|x|-j*Pi/4. + * 7.2) Reconstruction same as (5.2). + * 8) if x is Inf, return x-x, and set errno=EDOM. + * 9) if x is NaN, return x-x. + * + * Special cases: + * sin/cos(+-0) = +-0/1 not raising inexact/underflow, + * sin/cos(subnormal) raises inexact/underflow, + * sin/cos(min_normalized) raises inexact/underflow, + * sin/cos(normalized) raises inexact, + * sin/cos(Inf) = NaN, raises invalid, sets errno to EDOM, + * sin/cos(NaN) = NaN.
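+ *
+ * A rough C model of the sign selection in (5.2), assuming hypothetical
+ * helpers poly_sin/poly_cos standing in for the S*/C* evaluations above
+ * (a sign flag of 1 means "negate"):
+ *
+ *   int sign_sin = (x < 0.0f) ^ ((n >> 2) & 1);
+ *   int sign_cos = ((n + 2) >> 2) & 1;
+ *   double ps = poly_sin (t), pc = poly_cos (t);
+ *   if (n & 2)
+ *     { *sinp = sign_sin ? -pc : pc; *cosp = sign_cos ? -ps : ps; }
+ *   else
+ *     { *sinp = sign_sin ? -ps : ps; *cosp = sign_cos ? -pc : pc; }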
+ */ + +#ifdef PIC +# define MO1(symbol) L(symbol)##@GOTOFF(%ebx) +# define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%ebx,reg2,_scale) +# define CFI_PUSH(REG) cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0) +# define CFI_POP(REG) cfi_adjust_cfa_offset(-4); cfi_restore(REG) +# define PUSH(REG) pushl REG; CFI_PUSH(REG) +# define POP(REG) popl REG; CFI_POP(REG) +# define ENTRANCE PUSH(%ebx); LOAD_PIC_REG(bx) +# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx) +# define ARG_X 8(%esp) +# define ARG_SIN_PTR 12(%esp) +# define ARG_COS_PTR 16(%esp) +#else +# define MO1(symbol) L(symbol) +# define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale) +# define ENTRANCE +# define RETURN ret +# define ARG_X 4(%esp) +# define ARG_SIN_PTR 8(%esp) +# define ARG_COS_PTR 12(%esp) +#endif + + .text +ENTRY(__sincosf_sse2) + /* Input: single precision x on stack at address ARG_X */ + /* pointer to sin result on stack at address ARG_SIN_PTR */ + /* pointer to cos result on stack at address ARG_COS_PTR */ + + ENTRANCE + movl ARG_X, %eax /* Bits of x */ + cvtss2sd ARG_X, %xmm0 /* DP x */ + andl $0x7fffffff, %eax /* |x| */ + + cmpl $0x3f490fdb, %eax /* |x|<Pi/4 ? */ + jb L(arg_less_pio4) + + /* Here if |x|>=Pi/4 */ + movd %eax, %xmm3 /* SP |x| */ + andpd MO1(DP_ABS_MASK),%xmm0 /* DP |x| */ + movss MO1(SP_INVPIO4), %xmm2 /* SP 1/(Pi/4) */ + + cmpl $0x40e231d6, %eax /* |x|<9*Pi/4 ? */ + jae L(large_args) + + /* Here if Pi/4<=|x|<9*Pi/4 */ + mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */ + movl ARG_X, %ecx /* Load x */ + cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */ + shrl $29, %ecx /* (sign of x) << 2 */ + addl $1, %eax /* k+1 */ + movl $0x0e, %edx + andl %eax, %edx /* j = (k+1)&0x0e */ + subsd MO2(PIO4J,%edx,8), %xmm0/* t = |x| - j * Pi/4 */ + +L(reconstruction): + /* Input: %eax=n, %xmm0=t, %ecx=sign(x) */ + + movaps %xmm0, %xmm4 /* t */ + movhpd MO1(DP_ONES), %xmm4 /* 1|t */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + movl $2, %edx + unpcklpd %xmm0, %xmm0 /* y|y */ + addl %eax, %edx /* k+2 */ + movaps %xmm0, %xmm1 /* y|y */ + mulpd %xmm0, %xmm0 /* z=t^4|z=t^4 */ + + movaps MO1(DP_SC4), %xmm2 /* S4 */ + mulpd %xmm0, %xmm2 /* z*S4 */ + movaps MO1(DP_SC3), %xmm3 /* S3 */ + mulpd %xmm0, %xmm3 /* z*S3 */ + xorl %eax, %ecx /* (sign_x ^ (k>>2))<<2 */ + addpd MO1(DP_SC2), %xmm2 /* S2+z*S4 */ + mulpd %xmm0, %xmm2 /* z*(S2+z*S4) */ + shrl $2, %edx /* (k+2)>>2 */ + addpd MO1(DP_SC1), %xmm3 /* S1+z*S3 */ + mulpd %xmm0, %xmm3 /* z*(S1+z*S3) */ + shrl $2, %ecx /* sign_x ^ k>>2 */ + addpd MO1(DP_SC0), %xmm2 /* S0+z*(S2+z*S4) */ + andl $1, %edx /* sign_cos = ((k+2)>>2)&1 */ + mulpd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */ + andl $1, %ecx /* sign_sin = sign_x ^ ((k>>2)&1) */ + addpd %xmm2, %xmm3 /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + mulpd %xmm4, %xmm3 /*t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/ + testl $2, %eax /* n&2 != 0 ? 
*/ + addpd %xmm4, %xmm3 /*t+t*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))*/ + jnz L(sin_result_sin_poly) + +/*L(sin_result_cos_poly):*/ + /* + * Here if + * cos(x) = poly_sin * sign_cos + * sin(x) = poly_cos * sign_sin + */ + movsd MO2(DP_ONES,%ecx,8), %xmm4/* 0|sign_sin */ + movhpd MO2(DP_ONES,%edx,8), %xmm4/* sign_cos|sign_sin */ + mulpd %xmm4, %xmm3 /* result_cos|result_sin */ + movl ARG_SIN_PTR, %eax + cvtpd2ps %xmm3, %xmm0 /* SP results */ + movl ARG_COS_PTR, %ecx + movss %xmm0, (%eax) /* store sin(x) from xmm0[0] */ + shufps $1, %xmm0, %xmm0 /* move cos(x) to xmm0[0] */ + movss %xmm0, (%ecx) /* store cos(x) */ + RETURN + + .p2align 4 +L(sin_result_sin_poly): + /* + * Here if + * sin(x) = poly_sin * sign_sin + * cos(x) = poly_cos * sign_cos + */ + movsd MO2(DP_ONES,%edx,8), %xmm4/* 0|sign_cos */ + movhpd MO2(DP_ONES,%ecx,8), %xmm4/* sign_sin|sign_cos */ + mulpd %xmm4, %xmm3 /* result_sin|result_cos */ + movl ARG_SIN_PTR, %eax + cvtpd2ps %xmm3, %xmm0 /* SP results */ + movl ARG_COS_PTR, %ecx + movss %xmm0, (%ecx) /* store cos(x) from xmm0[0] */ + shufps $1, %xmm0, %xmm0 /* move sin(x) to xmm0[0] */ + movss %xmm0, (%eax) /* store sin(x) */ + RETURN + + .p2align 4 +L(large_args): + /* Here if |x|>=9*Pi/4 */ + cmpl $0x7f800000, %eax /* x is Inf or NaN ? */ + jae L(arg_inf_or_nan) + + /* Here if finite |x|>=9*Pi/4 */ + cmpl $0x4b000000, %eax /* |x|<2^23 ? */ + jae L(very_large_args) + + /* Here if 9*Pi/4<=|x|<2^23 */ + movsd MO1(DP_INVPIO4), %xmm1 /* 1/(Pi/4) */ + mulsd %xmm0, %xmm1 /* |x|/(Pi/4) */ + cvttsd2si %xmm1, %eax /* k=trunc(|x|/(Pi/4)) */ + addl $1, %eax /* k+1 */ + movl %eax, %edx + andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */ + cvtsi2sdl %edx, %xmm4 /* DP j */ + movl ARG_X, %ecx /* Load x */ + movsd MO1(DP_PIO4HI), %xmm2 /* -PIO4HI = high part of -Pi/4 */ + shrl $29, %ecx /* (sign of x) << 2 */ + mulsd %xmm4, %xmm2 /* -j*PIO4HI */ + movsd MO1(DP_PIO4LO), %xmm3 /* -PIO4LO = low part of -Pi/4 */ + addsd %xmm2, %xmm0 /* |x| - j*PIO4HI */ + mulsd %xmm3, %xmm4 /* j*PIO4LO */ + addsd %xmm4, %xmm0 /* t = |x| - j*PIO4HI - j*PIO4LO */ + jmp L(reconstruction) + + .p2align 4 +L(very_large_args): + /* Here if finite |x|>=2^23 */ + + /* bitpos = (ix>>23) - BIAS_32 + 59; */ + shrl $23, %eax /* eb = biased exponent of x */ + subl $68, %eax /* bitpos=eb-0x7f+59, where 0x7f */ + /*is exponent bias */ + movl $28, %ecx /* %cl=28 */ + movl %eax, %edx /* bitpos copy */ + + /* j = bitpos/28; */ + div %cl /* j in register %al=%ax/%cl */ + movapd %xmm0, %xmm3 /* |x| */ + andl $0xff, %eax /* clear unneeded remainder from %ah*/ + + imull $28, %eax, %ecx /* j*28 */ + movsd MO1(DP_HI_MASK), %xmm4 /* DP_HI_MASK */ + movapd %xmm0, %xmm5 /* |x| */ + mulsd -2*8+MO2(_FPI,%eax,8), %xmm3/* tmp3 = FPI[j-2]*|x| */ + movapd %xmm0, %xmm1 /* |x| */ + mulsd -1*8+MO2(_FPI,%eax,8), %xmm5/* tmp2 = FPI[j-1]*|x| */ + mulsd 0*8+MO2(_FPI,%eax,8), %xmm0/* tmp0 = FPI[j]*|x| */ + addl $19, %ecx /* j*28+19 */ + mulsd 1*8+MO2(_FPI,%eax,8), %xmm1/* tmp1 = FPI[j+1]*|x| */ + cmpl %ecx, %edx /* bitpos>=j*28+19 ? */ + jl L(very_large_skip1) + + /* Here if bitpos>=j*28+19 */ + andpd %xmm3, %xmm4 /* HI(tmp3) */ + subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */ +L(very_large_skip1): + + movsd MO1(DP_2POW52), %xmm6 + movapd %xmm5, %xmm2 /* tmp2 copy */ + addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */ + movl $1, %edx + addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */ + movsd 8+MO1(DP_2POW52), %xmm4 + movd %xmm6, %eax /* k = I64_LO(tmp6); */ + addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */ + movl ARG_X, %ecx /* Load x */ + comisd %xmm5, %xmm4 /* tmp4 > tmp5 ? 
 */ + jbe L(very_large_skip2) + + /* Here if tmp4 > tmp5 */ + subl $1, %eax /* k-- */ + addsd 8+MO1(DP_ONES), %xmm4 /* tmp4 -= 1.0 */ +L(very_large_skip2): + + andl %eax, %edx /* k&1 */ + subsd %xmm4, %xmm3 /* tmp3 -= tmp4 */ + addsd MO2(DP_ZERONE,%edx,8), %xmm3/* t = DP_ZERONE[k&1] + tmp3 */ + addsd %xmm2, %xmm3 /* t += tmp2 */ + shrl $29, %ecx /* (sign of x) << 2 */ + addsd %xmm3, %xmm0 /* t += tmp0 */ + addl $1, %eax /* n=k+1 */ + addsd %xmm1, %xmm0 /* t += tmp1 */ + mulsd MO1(DP_PIO4), %xmm0 /* t *= PIO4 */ + + jmp L(reconstruction) /* end of very_large_args path */ + + .p2align 4 +L(arg_less_pio4): + /* Here if |x|<Pi/4 */ + cmpl $0x3d000000, %eax /* |x|<2^-5 ? */ + jl L(arg_less_2pn5) + + /* Here if 2^-5<=|x|<Pi/4 */ + movaps %xmm0, %xmm3 /* DP x */ + movhpd MO1(DP_ONES), %xmm3 /* DP 1|x */ + mulsd %xmm0, %xmm0 /* DP y=x^2 */ + unpcklpd %xmm0, %xmm0 /* DP y|y */ + movaps %xmm0, %xmm1 /* y|y */ + mulpd %xmm0, %xmm0 /* z=x^4|z=x^4 */ + + movapd MO1(DP_SC4), %xmm4 /* S4 */ + mulpd %xmm0, %xmm4 /* z*S4 */ + movapd MO1(DP_SC3), %xmm5 /* S3 */ + mulpd %xmm0, %xmm5 /* z*S3 */ + addpd MO1(DP_SC2), %xmm4 /* S2+z*S4 */ + mulpd %xmm0, %xmm4 /* z*(S2+z*S4) */ + addpd MO1(DP_SC1), %xmm5 /* S1+z*S3 */ + mulpd %xmm0, %xmm5 /* z*(S1+z*S3) */ + addpd MO1(DP_SC0), %xmm4 /* S0+z*(S2+z*S4) */ + mulpd %xmm1, %xmm4 /* y*(S0+z*(S2+z*S4)) */ + mulpd %xmm3, %xmm5 /* x*z*(S1+z*S3) */ + mulpd %xmm3, %xmm4 /* x*y*(S0+z*(S2+z*S4)) */ + addpd %xmm5, %xmm4 /*x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))))*/ + movl ARG_SIN_PTR, %eax + addpd %xmm4, %xmm3 /*x+x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4))*/ + movl ARG_COS_PTR, %ecx + cvtpd2ps %xmm3, %xmm0 /* SP results */ + movss %xmm0, (%eax) /* store sin(x) from xmm0[0] */ + shufps $1, %xmm0, %xmm0 /* move cos(x) to xmm0[0] */ + movss %xmm0, (%ecx) /* store cos(x) */ + RETURN + + .p2align 4 +L(arg_less_2pn5): + /* Here if |x|<2^-5 */ + cmpl $0x32000000, %eax /* |x|<2^-27 ? */ + jl L(arg_less_2pn27) + + /* Here if 2^-27<=|x|<2^-5 */ + movaps %xmm0, %xmm1 /* DP x */ + movhpd MO1(DP_ONES), %xmm1 /* DP 1|x */ + mulsd %xmm0, %xmm0 /* DP x^2 */ + unpcklpd %xmm0, %xmm0 /* DP x^2|x^2 */ + + movaps MO1(DP_SINCOS2_1), %xmm3/* DP DP_SIN2_1 */ + mulpd %xmm0, %xmm3 /* DP x^2*DP_SIN2_1 */ + addpd MO1(DP_SINCOS2_0), %xmm3/* DP DP_SIN2_0+x^2*DP_SIN2_1 */ + mulpd %xmm0, %xmm3 /* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */ + mulpd %xmm1, %xmm3 /* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */ + addpd %xmm1, %xmm3 /* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */ + movl ARG_SIN_PTR, %eax + cvtpd2ps %xmm3, %xmm0 /* SP results */ + movl ARG_COS_PTR, %ecx + movss %xmm0, (%eax) /* store sin(x) from xmm0[0] */ + shufps $1, %xmm0, %xmm0 /* move cos(x) to xmm0[0] */ + movss %xmm0, (%ecx) /* store cos(x) */ + RETURN + + .p2align 4 +L(arg_less_2pn27): + movss ARG_X, %xmm7 /* SP x */ + cmpl $0, %eax /* x=0 ?
*/ + je L(arg_zero) /* in case x=0 return sin(+-0)==+-0 */ + /* Here if |x|<2^-27 */ + /* + * Special cases here: + * sin(subnormal) raises inexact/underflow + * sin(min_normalized) raises inexact/underflow + * sin(normalized) raises inexact + * cos(here)=1-|x| (raising inexact) + */ + movaps %xmm0, %xmm3 /* DP x */ + mulsd MO1(DP_SMALL), %xmm0 /* DP x*DP_SMALL */ + subsd %xmm0, %xmm3 /* DP sin result is x-x*DP_SMALL */ + andps MO1(SP_ABS_MASK), %xmm7 /* SP |x| */ + cvtsd2ss %xmm3, %xmm0 /* sin(x) */ + movl ARG_SIN_PTR, %eax + movss MO1(SP_ONE), %xmm1 /* SP 1.0 */ + movss %xmm0, (%eax) /* sin(x) store */ + movl ARG_COS_PTR, %ecx + subss %xmm7, %xmm1 /* cos(x) */ + movss %xmm1, (%ecx) /* cos(x) store */ + RETURN + + .p2align 4 +L(arg_zero): + movss MO1(SP_ONE), %xmm0 /* 1.0 */ + movl ARG_SIN_PTR, %eax + movl ARG_COS_PTR, %ecx + movss %xmm7, (%eax) /* sin(+-0)==x */ + movss %xmm0, (%ecx) /* cos(+-0)==1 */ + RETURN + + .p2align 4 +L(arg_inf_or_nan): + movss ARG_X, %xmm7 /* SP x */ + /* Here if |x| is Inf or NAN */ + jne L(skip_errno_setting) /* in case of x is NaN */ + + /* Here if x is Inf. Set errno to EDOM. */ + call JUMPTARGET(__errno_location) + movl $EDOM, (%eax) + + .p2align 4 +L(skip_errno_setting): + /* Here if |x| is Inf or NAN. Continued. */ + subss %xmm7, %xmm7 /* x-x, result is NaN */ + movl ARG_SIN_PTR, %eax + movl ARG_COS_PTR, %ecx + movss %xmm7, (%eax) + movss %xmm7, (%ecx) + RETURN +END(__sincosf_sse2) + + .section .rodata, "a" + .p2align 3 +L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */ + .long 0x00000000,0x00000000 + .long 0x54442d18,0x3fe921fb + .long 0x54442d18,0x3ff921fb + .long 0x7f3321d2,0x4002d97c + .long 0x54442d18,0x400921fb + .long 0x2955385e,0x400f6a7a + .long 0x7f3321d2,0x4012d97c + .long 0xe9bba775,0x4015fdbb + .long 0x54442d18,0x401921fb + .long 0xbeccb2bb,0x401c463a + .long 0x2955385e,0x401f6a7a + .type L(PIO4J), @object + ASM_SIZE_DIRECTIVE(L(PIO4J)) + + .p2align 3 +L(_FPI): /* 4/Pi broken into sum of positive DP values */ + .long 0x00000000,0x00000000 + .long 0x6c000000,0x3ff45f30 + .long 0x2a000000,0x3e3c9c88 + .long 0xa8000000,0x3c54fe13 + .long 0xd0000000,0x3aaf47d4 + .long 0x6c000000,0x38fbb81b + .long 0xe0000000,0x3714acc9 + .long 0x7c000000,0x3560e410 + .long 0x56000000,0x33bca2c7 + .long 0xac000000,0x31fbd778 + .long 0xe0000000,0x300b7246 + .long 0xe8000000,0x2e5d2126 + .long 0x48000000,0x2c970032 + .long 0xe8000000,0x2ad77504 + .long 0xe0000000,0x290921cf + .long 0xb0000000,0x274deb1c + .long 0xe0000000,0x25829a73 + .long 0xbe000000,0x23fd1046 + .long 0x10000000,0x2224baed + .long 0x8e000000,0x20709d33 + .long 0x80000000,0x1e535a2f + .long 0x64000000,0x1cef904e + .long 0x30000000,0x1b0d6398 + .long 0x24000000,0x1964ce7d + .long 0x16000000,0x17b908bf + .type L(_FPI), @object + ASM_SIZE_DIRECTIVE(L(_FPI)) + +/* Coefficients of polynomials for */ +/* sin(x)~=x+x*x^2*(DP_SIN2_0+x^2*DP_SIN2_1) in low DP part, */ +/* cos(x)~=1+1*x^2*(DP_COS2_0+x^2*DP_COS2_1) in high DP part, */ +/* for |x|<2^-5. 
 */ + .p2align 4 +L(DP_SINCOS2_0): + .long 0x5543d49d,0xbfc55555 + .long 0xff5cc6fd,0xbfdfffff + .type L(DP_SINCOS2_0), @object + ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_0)) + + .p2align 4 +L(DP_SINCOS2_1): + .long 0x75cec8c5,0x3f8110f4 + .long 0xb178dac5,0x3fa55514 + .type L(DP_SINCOS2_1), @object + ASM_SIZE_DIRECTIVE(L(DP_SINCOS2_1)) + + .p2align 3 +L(DP_ZERONE): + .long 0x00000000,0x00000000 /* 0.0 */ + .long 0x00000000,0xbff00000 /* -1.0 */ + .type L(DP_ZERONE), @object + ASM_SIZE_DIRECTIVE(L(DP_ZERONE)) + + .p2align 3 +L(DP_ONES): + .long 0x00000000,0x3ff00000 /* +1.0 */ + .long 0x00000000,0xbff00000 /* -1.0 */ + .type L(DP_ONES), @object + ASM_SIZE_DIRECTIVE(L(DP_ONES)) + +/* Coefficients of polynomials for */ +/* sin(t)~=t+t*t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))) in low DP part, */ +/* cos(t)~=1+1*t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))) in high DP part, */ +/* for |t|<Pi/4. */ + .p2align 4 +L(DP_SC4): + .long 0x1674b58a,0xbe5a947e + .long 0xdd8844d7,0xbe923c97 + .type L(DP_SC4), @object + ASM_SIZE_DIRECTIVE(L(DP_SC4)) + + .p2align 4 +L(DP_SC3): + .long 0x64e6b5b4,0x3ec71d72 + .long 0x9ac43cc0,0x3efa00eb + .type L(DP_SC3), @object + ASM_SIZE_DIRECTIVE(L(DP_SC3)) + + .p2align 4 +L(DP_SC2): + .long 0x8b4bd1f9,0xbf2a019f + .long 0x348b6874,0xbf56c16b + .type L(DP_SC2), @object + ASM_SIZE_DIRECTIVE(L(DP_SC2)) + + .p2align 4 +L(DP_SC1): + .long 0x10c2688b,0x3f811111 + .long 0x545c50c7,0x3fa55555 + .type L(DP_SC1), @object + ASM_SIZE_DIRECTIVE(L(DP_SC1)) + + .p2align 4 +L(DP_SC0): + .long 0x55551cd9,0xbfc55555 + .long 0xfffe98ae,0xbfdfffff + .type L(DP_SC0), @object + ASM_SIZE_DIRECTIVE(L(DP_SC0)) + + .p2align 3 +L(DP_SMALL): + .long 0x00000000,0x3cd00000 /* 2^(-50) */ + .type L(DP_SMALL), @object + ASM_SIZE_DIRECTIVE(L(DP_SMALL)) + + .p2align 3 +L(DP_PIO4): + .long 0x54442d18,0x3fe921fb /* Pi/4 */ + .type L(DP_PIO4), @object + ASM_SIZE_DIRECTIVE(L(DP_PIO4)) + + .p2align 3 +L(DP_2POW52): + .long 0x00000000,0x43300000 /* +2^52 */ + .long 0x00000000,0xc3300000 /* -2^52 */ + .type L(DP_2POW52), @object + ASM_SIZE_DIRECTIVE(L(DP_2POW52)) + + .p2align 3 +L(DP_INVPIO4): + .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */ + .type L(DP_INVPIO4), @object + ASM_SIZE_DIRECTIVE(L(DP_INVPIO4)) + + .p2align 3 +L(DP_PIO4HI): + .long 0x54000000,0xbfe921fb /* High part of Pi/4 */ + .type L(DP_PIO4HI), @object + ASM_SIZE_DIRECTIVE(L(DP_PIO4HI)) + + .p2align 3 +L(DP_PIO4LO): + .long 0x11A62633,0xbe010b46 /* Low part of Pi/4 */ + .type L(DP_PIO4LO), @object + ASM_SIZE_DIRECTIVE(L(DP_PIO4LO)) + + .p2align 2 +L(SP_INVPIO4): + .long 0x3fa2f983 /* 4/Pi */ + .type L(SP_INVPIO4), @object + ASM_SIZE_DIRECTIVE(L(SP_INVPIO4)) + + .p2align 4 +L(DP_ABS_MASK): /* Mask for getting DP absolute value */ + .long 0xffffffff,0x7fffffff + .long 0xffffffff,0x7fffffff + .type L(DP_ABS_MASK), @object + ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK)) + + .p2align 3 +L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */ + .long 0x00000000,0xffffffff + .type L(DP_HI_MASK), @object + ASM_SIZE_DIRECTIVE(L(DP_HI_MASK)) + + .p2align 4 +L(SP_ABS_MASK): /* Mask for getting SP absolute value */ + .long 0x7fffffff,0x7fffffff + .long 0x7fffffff,0x7fffffff + .type L(SP_ABS_MASK), @object + ASM_SIZE_DIRECTIVE(L(SP_ABS_MASK)) + + .p2align 2 +L(SP_ONE): + .long 0x3f800000 /* 1.0 */ + .type L(SP_ONE), @object + ASM_SIZE_DIRECTIVE(L(SP_ONE)) + +weak_alias(__sincosf, sincosf) diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c new file mode 100644 index 0000000000..9428f9b4ea --- /dev/null +++
b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sincosf.c @@ -0,0 +1,30 @@ +/* Multiple versions of sincosf + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern void __sincosf_sse2 (float, float *, float *); +extern void __sincosf_ia32 (float, float *, float *); +void __sincosf (float, float *, float *); + +libm_ifunc (__sincosf, + HAS_CPU_FEATURE (SSE2) ? __sincosf_sse2 : __sincosf_ia32); +weak_alias (__sincosf, sincosf); + +#define SINCOSF __sincosf_ia32 +#include <sysdeps/ieee754/flt-32/s_sincosf.c> diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S new file mode 100644 index 0000000000..ee96018061 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf-sse2.S @@ -0,0 +1,566 @@ +/* Optimized with sse2 version of sinf + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#define __need_Emath +#include <bits/errno.h> + +/* Short algorithm description: + * + * 1) if |x| == 0: return x. + * 2) if |x| < 2^-27: return x-x*DP_SMALL, raise underflow only when needed. + * 3) if |x| < 2^-5 : return x+x^3*DP_SIN2_0+x^5*DP_SIN2_1. + * 4) if |x| < Pi/4: return x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). + * 5) if |x| < 9*Pi/4: + * 5.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0x0e, n=k+1, + * t=|x|-j*Pi/4. + * 5.2) Reconstruction: + * s = sign(x) * (-1.0)^((n>>2)&1) + * if(n&2 != 0) { + * using cos(t) polynomial for |t|<Pi/4, result is + * s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))). + * } else { + * using sin(t) polynomial for |t|<Pi/4, result is + * s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))). + * } + * 6) if |x| < 2^23, large args: + * 6.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, + * t=|x|-j*Pi/4. + * 6.2) Reconstruction same as (5.2). + * 7) if |x| >= 2^23, very large args: + * 7.1) Range reduction: k=trunc(|x|/(Pi/4)), j=(k+1)&0xfffffffe, n=k+1, + * t=|x|-j*Pi/4. + * 7.2) Reconstruction same as (5.2). + * 8) if x is Inf, return x-x, and set errno=EDOM. 
+ * 9) if x is NaN, return x-x. + * + * Special cases: + * sin(+-0) = +-0 not raising inexact/underflow, + * sin(subnormal) raises inexact/underflow, + * sin(min_normalized) raises inexact/underflow, + * sin(normalized) raises inexact, + * sin(Inf) = NaN, raises invalid, sets errno to EDOM, + * sin(NaN) = NaN. + */ + +#ifdef PIC +# define MO1(symbol) L(symbol)##@GOTOFF(%ebx) +# define MO2(symbol,reg2,_scale) L(symbol)##@GOTOFF(%ebx,reg2,_scale) +# define CFI_PUSH(REG) cfi_adjust_cfa_offset(4); cfi_rel_offset(REG,0) +# define CFI_POP(REG) cfi_adjust_cfa_offset(-4); cfi_restore(REG) +# define PUSH(REG) pushl REG; CFI_PUSH(REG) +# define POP(REG) popl REG; CFI_POP(REG) +# define ENTRANCE PUSH(%ebx); LOAD_PIC_REG(bx) +# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx) +# define ARG_X 8(%esp) +#else +# define MO1(symbol) L(symbol) +# define MO2(symbol,reg2,_scale) L(symbol)(,reg2,_scale) +# define ENTRANCE +# define RETURN ret +# define ARG_X 4(%esp) +#endif + + .text +ENTRY(__sinf_sse2) + /* Input: single precision x on stack at address ARG_X */ + + ENTRANCE + movl ARG_X, %eax /* Bits of x */ + cvtss2sd ARG_X, %xmm0 /* DP x */ + andl $0x7fffffff, %eax /* |x| */ + + cmpl $0x3f490fdb, %eax /* |x|<Pi/4? */ + jb L(arg_less_pio4) + + /* Here if |x|>=Pi/4 */ + movd %eax, %xmm3 /* SP |x| */ + andpd MO1(DP_ABS_MASK),%xmm0 /* DP |x| */ + movss MO1(SP_INVPIO4), %xmm2 /* SP 1/(Pi/4) */ + + cmpl $0x40e231d6, %eax /* |x|<9*Pi/4? */ + jae L(large_args) + + /* Here if Pi/4<=|x|<9*Pi/4 */ + mulss %xmm3, %xmm2 /* SP |x|/(Pi/4) */ + movl ARG_X, %ecx /* Load x */ + cvttss2si %xmm2, %eax /* k, number of Pi/4 in x */ + shrl $31, %ecx /* sign of x */ + addl $1, %eax /* k+1 */ + movl $0x0e, %edx + andl %eax, %edx /* j = (k+1)&0x0e */ + subsd MO2(PIO4J,%edx,8), %xmm0 /* t = |x| - j * Pi/4 */ + +L(reconstruction): + /* Input: %eax=n, %xmm0=t, %ecx=sign(x) */ + testl $2, %eax /* n&2 != 0? */ + jz L(sin_poly) + +/*L(cos_poly):*/ + /* Here if sin(x) calculated using cos(t) polynomial for |t|<Pi/4: + * y = t*t; z = y*y; + * s = sign(x) * (-1.0)^((n>>2)&1) + * result = s * (1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4))))) + */ + shrl $2, %eax /* n>>2 */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + andl $1, %eax /* (n>>2)&1 */ + movaps %xmm0, %xmm1 /* y */ + mulsd %xmm0, %xmm0 /* z=t^4 */ + + movsd MO1(DP_C4), %xmm4 /* C4 */ + mulsd %xmm0, %xmm4 /* z*C4 */ + xorl %eax, %ecx /* (-1.0)^((n>>2)&1) XOR sign(x) */ + movsd MO1(DP_C3), %xmm3 /* C3 */ + mulsd %xmm0, %xmm3 /* z*C3 */ + addsd MO1(DP_C2), %xmm4 /* C2+z*C4 */ + mulsd %xmm0, %xmm4 /* z*(C2+z*C4) */ + lea -8(%esp), %esp /* Borrow 4 bytes of stack frame */ + addsd MO1(DP_C1), %xmm3 /* C1+z*C3 */ + mulsd %xmm0, %xmm3 /* z*(C1+z*C3) */ + addsd MO1(DP_C0), %xmm4 /* C0+z*(C2+z*C4) */ + mulsd %xmm1, %xmm4 /* y*(C0+z*(C2+z*C4)) */ + + addsd %xmm4, %xmm3 /* y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + /* 1.0+y*(C0+y*(C1+y*(C2+y*(C3+y*C4)))) */ + addsd MO1(DP_ONES), %xmm3 + + mulsd MO2(DP_ONES,%ecx,8), %xmm3 /* DP result */ + movsd %xmm3, 0(%esp) /* Move result from sse... */ + fldl 0(%esp) /* ...to FPU. 
*/ + /* Return back 4 bytes of stack frame */ + lea 8(%esp), %esp + RETURN + + .p2align 4 +L(sin_poly): + /* Here if sin(x) calculated using sin(t) polynomial for |t|<Pi/4: + * y = t*t; z = y*y; + * s = sign(x) * (-1.0)^((n>>2)&1) + * result = s * t * (1.0+t^2*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4))))) + */ + + movaps %xmm0, %xmm4 /* t */ + shrl $2, %eax /* n>>2 */ + mulsd %xmm0, %xmm0 /* y=t^2 */ + andl $1, %eax /* (n>>2)&1 */ + movaps %xmm0, %xmm1 /* y */ + xorl %eax, %ecx /* (-1.0)^((n>>2)&1) XOR sign(x) */ + mulsd %xmm0, %xmm0 /* z=t^4 */ + + movsd MO1(DP_S4), %xmm2 /* S4 */ + mulsd %xmm0, %xmm2 /* z*S4 */ + movsd MO1(DP_S3), %xmm3 /* S3 */ + mulsd %xmm0, %xmm3 /* z*S3 */ + lea -8(%esp), %esp /* Borrow 4 bytes of stack frame */ + addsd MO1(DP_S2), %xmm2 /* S2+z*S4 */ + mulsd %xmm0, %xmm2 /* z*(S2+z*S4) */ + addsd MO1(DP_S1), %xmm3 /* S1+z*S3 */ + mulsd %xmm0, %xmm3 /* z*(S1+z*S3) */ + addsd MO1(DP_S0), %xmm2 /* S0+z*(S2+z*S4) */ + mulsd %xmm1, %xmm2 /* y*(S0+z*(S2+z*S4)) */ + /* t*s, where s = sign(x) * (-1.0)^((n>>2)&1) */ + mulsd MO2(DP_ONES,%ecx,8), %xmm4 + addsd %xmm2, %xmm3 /* y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + /* t*s*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + mulsd %xmm4, %xmm3 + /* t*s*(1.0+y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + addsd %xmm4, %xmm3 + movsd %xmm3, 0(%esp) /* Move result from sse... */ + fldl 0(%esp) /* ...to FPU. */ + /* Return back 4 bytes of stack frame */ + lea 8(%esp), %esp + RETURN + + .p2align 4 +L(large_args): + /* Here if |x|>=9*Pi/4 */ + cmpl $0x7f800000, %eax /* x is Inf or NaN? */ + jae L(arg_inf_or_nan) + + /* Here if finite |x|>=9*Pi/4 */ + cmpl $0x4b000000, %eax /* |x|<2^23? */ + jae L(very_large_args) + + /* Here if 9*Pi/4<=|x|<2^23 */ + movsd MO1(DP_INVPIO4), %xmm1 /* 1/(Pi/4) */ + mulsd %xmm0, %xmm1 /* |x|/(Pi/4) */ + cvttsd2si %xmm1, %eax /* k=trunc(|x|/(Pi/4)) */ + addl $1, %eax /* k+1 */ + movl %eax, %edx + andl $0xfffffffe, %edx /* j=(k+1)&0xfffffffe */ + cvtsi2sdl %edx, %xmm4 /* DP j */ + movl ARG_X, %ecx /* Load x */ + movsd MO1(DP_PIO4HI), %xmm2 /* -PIO4HI = high part of -Pi/4 */ + shrl $31, %ecx /* sign bit of x */ + mulsd %xmm4, %xmm2 /* -j*PIO4HI */ + movsd MO1(DP_PIO4LO), %xmm3 /* -PIO4LO = low part of -Pi/4 */ + addsd %xmm2, %xmm0 /* |x| - j*PIO4HI */ + mulsd %xmm3, %xmm4 /* j*PIO4LO */ + addsd %xmm4, %xmm0 /* t = |x| - j*PIO4HI - j*PIO4LO */ + jmp L(reconstruction) + + .p2align 4 +L(very_large_args): + /* Here if finite |x|>=2^23 */ + + /* bitpos = (ix>>23) - BIAS_32 + 59; */ + shrl $23, %eax /* eb = biased exponent of x */ + /* bitpos = eb - 0x7f + 59, where 0x7f is exponent bias */ + subl $68, %eax + movl $28, %ecx /* %cl=28 */ + movl %eax, %edx /* bitpos copy */ + + /* j = bitpos/28; */ + div %cl /* j in register %al=%ax/%cl */ + movapd %xmm0, %xmm3 /* |x| */ + /* clear unneeded remainder from %ah */ + andl $0xff, %eax + + imull $28, %eax, %ecx /* j*28 */ + movsd MO1(DP_HI_MASK), %xmm4 /* DP_HI_MASK */ + movapd %xmm0, %xmm5 /* |x| */ + mulsd -2*8+MO2(_FPI,%eax,8), %xmm3 /* tmp3 = FPI[j-2]*|x| */ + movapd %xmm0, %xmm1 /* |x| */ + mulsd -1*8+MO2(_FPI,%eax,8), %xmm5 /* tmp2 = FPI[j-1]*|x| */ + mulsd 0*8+MO2(_FPI,%eax,8), %xmm0 /* tmp0 = FPI[j]*|x| */ + addl $19, %ecx /* j*28+19 */ + mulsd 1*8+MO2(_FPI,%eax,8), %xmm1 /* tmp1 = FPI[j+1]*|x| */ + cmpl %ecx, %edx /* bitpos>=j*28+19? 
 */ + jl L(very_large_skip1) + + /* Here if bitpos>=j*28+19 */ + andpd %xmm3, %xmm4 /* HI(tmp3) */ + subsd %xmm4, %xmm3 /* tmp3 = tmp3 - HI(tmp3) */ +L(very_large_skip1): + + movsd MO1(DP_2POW52), %xmm6 + movapd %xmm5, %xmm2 /* tmp2 copy */ + addsd %xmm3, %xmm5 /* tmp5 = tmp3 + tmp2 */ + movl $1, %edx + addsd %xmm5, %xmm6 /* tmp6 = tmp5 + 2^52 */ + movsd 8+MO1(DP_2POW52), %xmm4 + movd %xmm6, %eax /* k = I64_LO(tmp6); */ + addsd %xmm6, %xmm4 /* tmp4 = tmp6 - 2^52 */ + movl ARG_X, %ecx /* Load x */ + comisd %xmm5, %xmm4 /* tmp4 > tmp5? */ + jbe L(very_large_skip2) + + /* Here if tmp4 > tmp5 */ + subl $1, %eax /* k-- */ + addsd 8+MO1(DP_ONES), %xmm4 /* tmp4 -= 1.0 */ +L(very_large_skip2): + + andl %eax, %edx /* k&1 */ + subsd %xmm4, %xmm3 /* tmp3 -= tmp4 */ + addsd MO2(DP_ZERONE,%edx,8), %xmm3 /* t = DP_ZERONE[k&1] + tmp3 */ + addsd %xmm2, %xmm3 /* t += tmp2 */ + shrl $31, %ecx /* sign of x */ + addsd %xmm3, %xmm0 /* t += tmp0 */ + addl $1, %eax /* n=k+1 */ + addsd %xmm1, %xmm0 /* t += tmp1 */ + mulsd MO1(DP_PIO4), %xmm0 /* t *= PIO4 */ + + jmp L(reconstruction) /* end of very_large_args path */ + + .p2align 4 +L(arg_less_pio4): + /* Here if |x|<Pi/4 */ + cmpl $0x3d000000, %eax /* |x|<2^-5? */ + jl L(arg_less_2pn5) + + /* Here if 2^-5<=|x|<Pi/4 */ + movaps %xmm0, %xmm3 /* x */ + mulsd %xmm0, %xmm0 /* y=x^2 */ + movaps %xmm0, %xmm1 /* y */ + mulsd %xmm0, %xmm0 /* z=x^4 */ + movsd MO1(DP_S4), %xmm4 /* S4 */ + mulsd %xmm0, %xmm4 /* z*S4 */ + movsd MO1(DP_S3), %xmm5 /* S3 */ + mulsd %xmm0, %xmm5 /* z*S3 */ + addsd MO1(DP_S2), %xmm4 /* S2+z*S4 */ + mulsd %xmm0, %xmm4 /* z*(S2+z*S4) */ + addsd MO1(DP_S1), %xmm5 /* S1+z*S3 */ + mulsd %xmm0, %xmm5 /* z*(S1+z*S3) */ + addsd MO1(DP_S0), %xmm4 /* S0+z*(S2+z*S4) */ + mulsd %xmm1, %xmm4 /* y*(S0+z*(S2+z*S4)) */ + mulsd %xmm3, %xmm5 /* x*z*(S1+z*S3) */ + mulsd %xmm3, %xmm4 /* x*y*(S0+z*(S2+z*S4)) */ + /* x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + addsd %xmm5, %xmm4 + /* x + x*y*(S0+y*(S1+y*(S2+y*(S3+y*S4)))) */ + addsd %xmm4, %xmm3 + cvtsd2ss %xmm3, %xmm3 /* SP result */ + +L(epilogue): + lea -4(%esp), %esp /* Borrow 4 bytes of stack frame */ + movss %xmm3, 0(%esp) /* Move result from sse... */ + flds 0(%esp) /* ...to FPU. */ + /* Return back 4 bytes of stack frame */ + lea 4(%esp), %esp + RETURN + + .p2align 4 +L(arg_less_2pn5): + /* Here if |x|<2^-5 */ + cmpl $0x32000000, %eax /* |x|<2^-27? */ + jl L(arg_less_2pn27) + + /* Here if 2^-27<=|x|<2^-5 */ + movaps %xmm0, %xmm1 /* DP x */ + mulsd %xmm0, %xmm0 /* DP x^2 */ + movsd MO1(DP_SIN2_1), %xmm3 /* DP DP_SIN2_1 */ + mulsd %xmm0, %xmm3 /* DP x^2*DP_SIN2_1 */ + addsd MO1(DP_SIN2_0), %xmm3 /* DP DP_SIN2_0+x^2*DP_SIN2_1 */ + mulsd %xmm0, %xmm3 /* DP x^2*DP_SIN2_0+x^4*DP_SIN2_1 */ + mulsd %xmm1, %xmm3 /* DP x^3*DP_SIN2_0+x^5*DP_SIN2_1 */ + addsd %xmm1, %xmm3 /* DP x+x^3*DP_SIN2_0+x^5*DP_SIN2_1 */ + cvtsd2ss %xmm3, %xmm3 /* SP result */ + jmp L(epilogue) + + .p2align 4 +L(arg_less_2pn27): + movss ARG_X, %xmm3 /* SP x */ + cmpl $0, %eax /* x=0?
 */ + je L(epilogue) /* in case x=0 return sin(+-0)==+-0 */ + /* Here if |x|<2^-27 */ + /* + * Special cases here: + * sin(subnormal) raises inexact/underflow + * sin(min_normalized) raises inexact/underflow + * sin(normalized) raises inexact + */ + movaps %xmm0, %xmm3 /* Copy of DP x */ + mulsd MO1(DP_SMALL), %xmm0 /* x*DP_SMALL */ + subsd %xmm0, %xmm3 /* Result is x-x*DP_SMALL */ + cvtsd2ss %xmm3, %xmm3 /* Result converted to SP */ + jmp L(epilogue) + + .p2align 4 +L(arg_inf_or_nan): + /* Here if |x| is Inf or NAN */ + jne L(skip_errno_setting) /* in case x is NaN */ + + /* Here if x is Inf. Set errno to EDOM. */ + call JUMPTARGET(__errno_location) + movl $EDOM, (%eax) + + .p2align 4 +L(skip_errno_setting): + /* Here if |x| is Inf or NAN. Continued. */ + movss ARG_X, %xmm3 /* load x */ + subss %xmm3, %xmm3 /* Result is NaN */ + jmp L(epilogue) +END(__sinf_sse2) + + .section .rodata, "a" + .p2align 3 +L(PIO4J): /* Table of j*Pi/4, for j=0,1,..,10 */ + .long 0x00000000,0x00000000 + .long 0x54442d18,0x3fe921fb + .long 0x54442d18,0x3ff921fb + .long 0x7f3321d2,0x4002d97c + .long 0x54442d18,0x400921fb + .long 0x2955385e,0x400f6a7a + .long 0x7f3321d2,0x4012d97c + .long 0xe9bba775,0x4015fdbb + .long 0x54442d18,0x401921fb + .long 0xbeccb2bb,0x401c463a + .long 0x2955385e,0x401f6a7a + .type L(PIO4J), @object + ASM_SIZE_DIRECTIVE(L(PIO4J)) + + .p2align 3 +L(_FPI): /* 4/Pi broken into sum of positive DP values */ + .long 0x00000000,0x00000000 + .long 0x6c000000,0x3ff45f30 + .long 0x2a000000,0x3e3c9c88 + .long 0xa8000000,0x3c54fe13 + .long 0xd0000000,0x3aaf47d4 + .long 0x6c000000,0x38fbb81b + .long 0xe0000000,0x3714acc9 + .long 0x7c000000,0x3560e410 + .long 0x56000000,0x33bca2c7 + .long 0xac000000,0x31fbd778 + .long 0xe0000000,0x300b7246 + .long 0xe8000000,0x2e5d2126 + .long 0x48000000,0x2c970032 + .long 0xe8000000,0x2ad77504 + .long 0xe0000000,0x290921cf + .long 0xb0000000,0x274deb1c + .long 0xe0000000,0x25829a73 + .long 0xbe000000,0x23fd1046 + .long 0x10000000,0x2224baed + .long 0x8e000000,0x20709d33 + .long 0x80000000,0x1e535a2f + .long 0x64000000,0x1cef904e + .long 0x30000000,0x1b0d6398 + .long 0x24000000,0x1964ce7d + .long 0x16000000,0x17b908bf + .type L(_FPI), @object + ASM_SIZE_DIRECTIVE(L(_FPI)) + +/* Coefficients of polynomial + for sin(x)~=x+x^3*DP_SIN2_0+x^5*DP_SIN2_1, |x|<2^-5. */ + .p2align 3 +L(DP_SIN2_0): + .long 0x5543d49d,0xbfc55555 + .type L(DP_SIN2_0), @object + ASM_SIZE_DIRECTIVE(L(DP_SIN2_0)) + + .p2align 3 +L(DP_SIN2_1): + .long 0x75cec8c5,0x3f8110f4 + .type L(DP_SIN2_1), @object + ASM_SIZE_DIRECTIVE(L(DP_SIN2_1)) + + .p2align 3 +L(DP_ZERONE): + .long 0x00000000,0x00000000 /* 0.0 */ + .long 0x00000000,0xbff00000 /* -1.0 */ + .type L(DP_ZERONE), @object + ASM_SIZE_DIRECTIVE(L(DP_ZERONE)) + + .p2align 3 +L(DP_ONES): + .long 0x00000000,0x3ff00000 /* +1.0 */ + .long 0x00000000,0xbff00000 /* -1.0 */ + .type L(DP_ONES), @object + ASM_SIZE_DIRECTIVE(L(DP_ONES)) + +/* Coefficients of polynomial + for sin(t)~=t+t^3*(S0+t^2*(S1+t^2*(S2+t^2*(S3+t^2*S4)))), |t|<Pi/4.
*/ + .p2align 3 +L(DP_S3): + .long 0x64e6b5b4,0x3ec71d72 + .type L(DP_S3), @object + ASM_SIZE_DIRECTIVE(L(DP_S3)) + + .p2align 3 +L(DP_S1): + .long 0x10c2688b,0x3f811111 + .type L(DP_S1), @object + ASM_SIZE_DIRECTIVE(L(DP_S1)) + + .p2align 3 +L(DP_S4): + .long 0x1674b58a,0xbe5a947e + .type L(DP_S4), @object + ASM_SIZE_DIRECTIVE(L(DP_S4)) + + .p2align 3 +L(DP_S2): + .long 0x8b4bd1f9,0xbf2a019f + .type L(DP_S2), @object + ASM_SIZE_DIRECTIVE(L(DP_S2)) + + .p2align 3 +L(DP_S0): + .long 0x55551cd9,0xbfc55555 + .type L(DP_S0), @object + ASM_SIZE_DIRECTIVE(L(DP_S0)) + + .p2align 3 +L(DP_SMALL): + .long 0x00000000,0x3cd00000 /* 2^(-50) */ + .type L(DP_SMALL), @object + ASM_SIZE_DIRECTIVE(L(DP_SMALL)) + +/* Coefficients of polynomial + for cos(t)~=1.0+t^2*(C0+t^2*(C1+t^2*(C2+t^2*(C3+t^2*C4)))), |t|<Pi/4. */ + .p2align 3 +L(DP_C3): + .long 0x9ac43cc0,0x3efa00eb + .type L(DP_C3), @object + ASM_SIZE_DIRECTIVE(L(DP_C3)) + + .p2align 3 +L(DP_C1): + .long 0x545c50c7,0x3fa55555 + .type L(DP_C1), @object + ASM_SIZE_DIRECTIVE(L(DP_C1)) + + .p2align 3 +L(DP_C4): + .long 0xdd8844d7,0xbe923c97 + .type L(DP_C4), @object + ASM_SIZE_DIRECTIVE(L(DP_C4)) + + .p2align 3 +L(DP_C2): + .long 0x348b6874,0xbf56c16b + .type L(DP_C2), @object + ASM_SIZE_DIRECTIVE(L(DP_C2)) + + .p2align 3 +L(DP_C0): + .long 0xfffe98ae,0xbfdfffff + .type L(DP_C0), @object + ASM_SIZE_DIRECTIVE(L(DP_C0)) + + .p2align 3 +L(DP_PIO4): + .long 0x54442d18,0x3fe921fb /* Pi/4 */ + .type L(DP_PIO4), @object + ASM_SIZE_DIRECTIVE(L(DP_PIO4)) + + .p2align 3 +L(DP_2POW52): + .long 0x00000000,0x43300000 /* +2^52 */ + .long 0x00000000,0xc3300000 /* -2^52 */ + .type L(DP_2POW52), @object + ASM_SIZE_DIRECTIVE(L(DP_2POW52)) + + .p2align 3 +L(DP_INVPIO4): + .long 0x6dc9c883,0x3ff45f30 /* 4/Pi */ + .type L(DP_INVPIO4), @object + ASM_SIZE_DIRECTIVE(L(DP_INVPIO4)) + + .p2align 3 +L(DP_PIO4HI): + .long 0x54000000,0xbfe921fb /* High part of Pi/4 */ + .type L(DP_PIO4HI), @object + ASM_SIZE_DIRECTIVE(L(DP_PIO4HI)) + + .p2align 3 +L(DP_PIO4LO): + .long 0x11A62633,0xbe010b46 /* Low part of Pi/4 */ + .type L(DP_PIO4LO), @object + ASM_SIZE_DIRECTIVE(L(DP_PIO4LO)) + + .p2align 2 +L(SP_INVPIO4): + .long 0x3fa2f983 /* 4/Pi */ + .type L(SP_INVPIO4), @object + ASM_SIZE_DIRECTIVE(L(SP_INVPIO4)) + + .p2align 4 +L(DP_ABS_MASK): /* Mask for getting DP absolute value */ + .long 0xffffffff,0x7fffffff + .long 0xffffffff,0x7fffffff + .type L(DP_ABS_MASK), @object + ASM_SIZE_DIRECTIVE(L(DP_ABS_MASK)) + + .p2align 3 +L(DP_HI_MASK): /* Mask for getting high 21 bits of DP value */ + .long 0x00000000,0xffffffff + .type L(DP_HI_MASK), @object + ASM_SIZE_DIRECTIVE(L(DP_HI_MASK)) + +weak_alias (__sinf, sinf) diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c new file mode 100644 index 0000000000..8ccdd2f34d --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/multiarch/s_sinf.c @@ -0,0 +1,28 @@ +/* Multiple versions of sinf + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern float __sinf_sse2 (float); +extern float __sinf_ia32 (float); +float __sinf (float); + +libm_ifunc (__sinf, HAS_CPU_FEATURE (SSE2) ? __sinf_sse2 : __sinf_ia32); +weak_alias (__sinf, sinf); +#define SINF __sinf_ia32 +#include <sysdeps/ieee754/flt-32/s_sinf.c> diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S new file mode 100644 index 0000000000..ace8db9410 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmax.S @@ -0,0 +1,39 @@ +/* Compute maximum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fmax) + fldl 4(%esp) // x + fldl 12(%esp) // x : y + + fucomi %st(0), %st + fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise + + fxch + + fucomi %st(1), %st + fcmovb %st(1), %st + + fstp %st(1) + + ret +END(__fmax) +weak_alias (__fmax, fmax) diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S new file mode 100644 index 0000000000..3a25951a09 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxf.S @@ -0,0 +1,39 @@ +/* Compute maximum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
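The libm_ifunc line in s_sinf.c above is glibc's wrapper around a GNU indirect function; roughly, the mechanism is the following (a sketch, not the macro's exact expansion; HAS_CPU_FEATURE is the init-arch.h predicate):

extern float __sinf_sse2 (float);
extern float __sinf_ia32 (float);

/* The resolver runs once, at relocation time; afterwards calls go
   straight to the chosen implementation.  */
static void *
sinf_resolver (void)
{
  return HAS_CPU_FEATURE (SSE2) ? (void *) __sinf_sse2
                                : (void *) __sinf_ia32;
}
float __sinf (float) __attribute__ ((ifunc ("sinf_resolver")));

Because the ia32 fallback is compiled from the generic s_sinf.c under the SINF macro, only one of the two bodies ever pays the dispatch cost, and it is paid once per process.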
*/ + +#include <sysdep.h> + + .text +ENTRY(__fmaxf) + flds 4(%esp) // x + flds 8(%esp) // x : y + + fucomi %st(0), %st + fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise + + fxch + + fucomi %st(1), %st + fcmovb %st(1), %st + + fstp %st(1) + + ret +END(__fmaxf) +weak_alias (__fmaxf, fmaxf) diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S new file mode 100644 index 0000000000..3f6c21c63d --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmaxl.S @@ -0,0 +1,58 @@ +/* Compute maximum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fmaxl) + fldt 4(%esp) // x + fldt 16(%esp) // x : y + + fucomi %st(1), %st + jp 2f + fcmovb %st(1), %st + + fstp %st(1) + + ret + +2: // Unordered. + fucomi %st(0), %st + jp 3f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 11(%esp) + jz 4f + fstp %st(1) + ret + +3: // st(0) is a NaN; st(1) may or may not be. + fxch + fucomi %st(0), %st + jp 4f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 23(%esp) + jz 4f + fstp %st(1) + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. + faddp + ret +END(__fmaxl) +weak_alias (__fmaxl, fmaxl) diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S new file mode 100644 index 0000000000..72d306fd79 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fmin.S @@ -0,0 +1,37 @@ +/* Compute minimum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> + + .text +ENTRY(__fmin) + fldl 4(%esp) // x + fldl 12(%esp) // x : y + + fucomi %st(0), %st + fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise + + fucomi %st(1), %st + fcmovnb %st(1), %st + + fstp %st(1) + + ret +END(__fmin) +weak_alias (__fmin, fmin) diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S new file mode 100644 index 0000000000..52ea892bad --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminf.S @@ -0,0 +1,37 @@ +/* Compute minimum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fminf) + flds 4(%esp) // x + flds 8(%esp) // x : y + + fucomi %st(0), %st + fcmovu %st(1), %st // now %st contains y if not NaN, x otherwise + + fucomi %st(1), %st + fcmovnb %st(1), %st + + fstp %st(1) + + ret +END(__fminf) +weak_alias (__fminf, fminf) diff --git a/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S new file mode 100644 index 0000000000..e1cb83fed7 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/fpu/s_fminl.S @@ -0,0 +1,58 @@ +/* Compute minimum of two numbers, regarding NaN as missing argument. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + + .text +ENTRY(__fminl) + fldt 4(%esp) // x + fldt 16(%esp) // x : y + + fucomi %st(1), %st + jp 2f + fcmovnb %st(1), %st + + fstp %st(1) + + ret + +2: // Unordered. + fucomi %st(0), %st + jp 3f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 11(%esp) + jz 4f + fstp %st(1) + ret + +3: // st(0) is a NaN; st(1) may or may not be. + fxch + fucomi %st(0), %st + jp 4f + // st(1) is a NaN; st(0) is not. Test if st(1) is signaling. + testb $0x40, 23(%esp) + jz 4f + fstp %st(1) + ret + +4: // Both arguments are NaNs, or one is a signaling NaN. 
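Before the faddp below, it is worth spelling out the semantics all of these fmax/fmin entry points implement. A C model (a sketch; fmin_model is an illustrative name, and the extra signaling-NaN probing the long double versions do with testb $0x40 is not modeled):

#include <math.h>

static double
fmin_model (double x, double y)
{
  if (isnan (x))
    return isnan (y) ? x + y : y;   /* x + y mirrors the faddp path:
                                       it yields a quiet NaN and raises
                                       FE_INVALID for a signaling one */
  if (isnan (y))
    return x;
  return x < y ? x : y;
}

C99 treats a single quiet NaN as a missing argument, which is why the fcmovu in the short versions swaps the NaN out of the way before the ordinary comparison.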
+	faddp
+	ret
+END(__fminl)
+weak_alias (__fminl, fminl)
diff --git a/REORG.TODO/sysdeps/i386/i686/hp-timing.h b/REORG.TODO/sysdeps/i386/i686/hp-timing.h
new file mode 100644
index 0000000000..1b11410feb
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/hp-timing.h
@@ -0,0 +1,42 @@
+/* High precision, low overhead timing functions.  i686 version.
+   Copyright (C) 1998-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _HP_TIMING_H
+#define _HP_TIMING_H 1
+
+/* We always assume having the timestamp register.  */
+#define HP_TIMING_AVAIL (1)
+#define HP_SMALL_TIMING_AVAIL (1)
+
+/* We indeed have inlined functions.  */
+#define HP_TIMING_INLINE (1)
+
+/* We use 64-bit values for the times.  */
+typedef unsigned long long int hp_timing_t;
+
+/* That's quite simple.  Use the `rdtsc' instruction.  Note that the value
+   might not be 100% accurate since other instructions might be in flight
+   at that moment.  That could be avoided with a barrier like `cpuid'
+   right before the `rdtsc' instruction.  But we are not interested in
+   accurate clock cycles here, so we don't do that.  */
+#define HP_TIMING_NOW(Var) __asm__ __volatile__ ("rdtsc" : "=A" (Var))
+
+#include <hp-timing-common.h>
+
+#endif /* hp-timing.h */
diff --git a/REORG.TODO/sysdeps/i386/i686/init-arch.h b/REORG.TODO/sysdeps/i386/i686/init-arch.h
new file mode 100644
index 0000000000..f55f80efa0
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/init-arch.h
@@ -0,0 +1,19 @@
+/* Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define MINIMUM_ISA 686
+#include <sysdeps/x86/init-arch.h>
diff --git a/REORG.TODO/sysdeps/i386/i686/memcmp.S b/REORG.TODO/sysdeps/i386/i686/memcmp.S
new file mode 100644
index 0000000000..5140ee2145
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/memcmp.S
@@ -0,0 +1,408 @@
+/* Compare two memory blocks for differences in the first COUNT bytes.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4+4 /* Preserve EBX. */ +#define BLK1 PARMS +#define BLK2 BLK1+4 +#define LEN BLK2+4 +#define ENTRANCE pushl %ebx; cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (ebx, 0) +#define RETURN popl %ebx; cfi_adjust_cfa_offset (-4); \ + cfi_restore (ebx); ret + +/* Load an entry in a jump table into EBX. TABLE is a jump table + with relative offsets. INDEX is a register contains the index + into the jump table. */ +#define LOAD_JUMP_TABLE_ENTRY(TABLE, INDEX) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,4), %ebx + + .text + ALIGN (4) +ENTRY (memcmp) + ENTRANCE + + movl BLK1(%esp), %eax + movl BLK2(%esp), %edx + movl LEN(%esp), %ecx + + cmpl $1, %ecx + jne L(not_1) + movzbl (%eax), %ecx /* LEN == 1 */ + cmpb (%edx), %cl + jne L(neq) +L(bye): + xorl %eax, %eax + RETURN + + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) +L(neq): + sbbl %eax, %eax + sbbl $-1, %eax + RETURN + + cfi_adjust_cfa_offset (4) + cfi_rel_offset (ebx, 0) +L(not_1): + jl L(bye) /* LEN == 0 */ + + pushl %esi + cfi_adjust_cfa_offset (4) + movl %eax, %esi + cfi_rel_offset (esi, 0) + cmpl $32, %ecx; + jge L(32bytesormore) /* LEN => 32 */ + + LOAD_JUMP_TABLE_ENTRY (L(table_32bytes), %ecx) + addl %ecx, %edx + addl %ecx, %esi + jmp *%ebx + + ALIGN (4) +L(28bytes): + movl -28(%esi), %eax + movl -28(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(24bytes): + movl -24(%esi), %eax + movl -24(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(20bytes): + movl -20(%esi), %eax + movl -20(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(16bytes): + movl -16(%esi), %eax + movl -16(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(12bytes): + movl -12(%esi), %eax + movl -12(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(8bytes): + movl -8(%esi), %eax + movl -8(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(4bytes): + movl -4(%esi), %eax + movl -4(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(0bytes): + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + xorl %eax, %eax + RETURN + + cfi_adjust_cfa_offset (8) + cfi_rel_offset (esi, 0) + cfi_rel_offset (ebx, 4) +L(29bytes): + movl -29(%esi), %eax + movl -29(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(25bytes): + movl -25(%esi), %eax + movl -25(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(21bytes): + movl -21(%esi), %eax + movl -21(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(17bytes): + movl -17(%esi), %eax + movl -17(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(13bytes): + movl -13(%esi), %eax + movl -13(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(9bytes): + movl -9(%esi), %eax + movl -9(%edx), %ecx + 
cmpl %ecx, %eax + jne L(find_diff) +L(5bytes): + movl -5(%esi), %eax + movl -5(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(1bytes): + movzbl -1(%esi), %eax + cmpb -1(%edx), %al + jne L(set) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + xorl %eax, %eax + RETURN + + cfi_adjust_cfa_offset (8) + cfi_rel_offset (esi, 0) + cfi_rel_offset (ebx, 4) +L(30bytes): + movl -30(%esi), %eax + movl -30(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(26bytes): + movl -26(%esi), %eax + movl -26(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(22bytes): + movl -22(%esi), %eax + movl -22(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(18bytes): + movl -18(%esi), %eax + movl -18(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(14bytes): + movl -14(%esi), %eax + movl -14(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(10bytes): + movl -10(%esi), %eax + movl -10(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(6bytes): + movl -6(%esi), %eax + movl -6(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(2bytes): + movzwl -2(%esi), %eax + movzwl -2(%edx), %ecx + cmpb %cl, %al + jne L(set) + cmpl %ecx, %eax + jne L(set) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + xorl %eax, %eax + RETURN + + cfi_adjust_cfa_offset (8) + cfi_rel_offset (esi, 0) + cfi_rel_offset (ebx, 4) +L(31bytes): + movl -31(%esi), %eax + movl -31(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(27bytes): + movl -27(%esi), %eax + movl -27(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(23bytes): + movl -23(%esi), %eax + movl -23(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(19bytes): + movl -19(%esi), %eax + movl -19(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(15bytes): + movl -15(%esi), %eax + movl -15(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(11bytes): + movl -11(%esi), %eax + movl -11(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(7bytes): + movl -7(%esi), %eax + movl -7(%edx), %ecx + cmpl %ecx, %eax + jne L(find_diff) +L(3bytes): + movzwl -3(%esi), %eax + movzwl -3(%edx), %ecx + cmpb %cl, %al + jne L(set) + cmpl %ecx, %eax + jne L(set) + movzbl -1(%esi), %eax + cmpb -1(%edx), %al + jne L(set) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + xorl %eax, %eax + RETURN + + cfi_adjust_cfa_offset (8) + cfi_rel_offset (esi, 0) + cfi_rel_offset (ebx, 4) + ALIGN (4) +/* ECX >= 32. 
*/ +L(32bytesormore): + subl $32, %ecx + + movl (%esi), %eax + cmpl (%edx), %eax + jne L(load_ecx) + + movl 4(%esi), %eax + cmpl 4(%edx), %eax + jne L(load_ecx_4) + + movl 8(%esi), %eax + cmpl 8(%edx), %eax + jne L(load_ecx_8) + + movl 12(%esi), %eax + cmpl 12(%edx), %eax + jne L(load_ecx_12) + + movl 16(%esi), %eax + cmpl 16(%edx), %eax + jne L(load_ecx_16) + + movl 20(%esi), %eax + cmpl 20(%edx), %eax + jne L(load_ecx_20) + + movl 24(%esi), %eax + cmpl 24(%edx), %eax + jne L(load_ecx_24) + + movl 28(%esi), %eax + cmpl 28(%edx), %eax + jne L(load_ecx_28) + + addl $32, %esi + addl $32, %edx + cmpl $32, %ecx + jge L(32bytesormore) + + LOAD_JUMP_TABLE_ENTRY (L(table_32bytes), %ecx) + addl %ecx, %edx + addl %ecx, %esi + jmp *%ebx + +L(load_ecx_28): + addl $0x4, %edx +L(load_ecx_24): + addl $0x4, %edx +L(load_ecx_20): + addl $0x4, %edx +L(load_ecx_16): + addl $0x4, %edx +L(load_ecx_12): + addl $0x4, %edx +L(load_ecx_8): + addl $0x4, %edx +L(load_ecx_4): + addl $0x4, %edx +L(load_ecx): + movl (%edx), %ecx + +L(find_diff): + cmpb %cl, %al + jne L(set) + cmpb %ch, %ah + jne L(set) + shrl $16,%eax + shrl $16,%ecx + cmpb %cl, %al + jne L(set) + /* We get there only if we already know there is a + difference. */ + cmpl %ecx, %eax +L(set): + sbbl %eax, %eax + sbbl $-1, %eax + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + RETURN +END (memcmp) + + .section .rodata + ALIGN (2) +L(table_32bytes) : + .long L(0bytes) - L(table_32bytes) + .long L(1bytes) - L(table_32bytes) + .long L(2bytes) - L(table_32bytes) + .long L(3bytes) - L(table_32bytes) + .long L(4bytes) - L(table_32bytes) + .long L(5bytes) - L(table_32bytes) + .long L(6bytes) - L(table_32bytes) + .long L(7bytes) - L(table_32bytes) + .long L(8bytes) - L(table_32bytes) + .long L(9bytes) - L(table_32bytes) + .long L(10bytes) - L(table_32bytes) + .long L(11bytes) - L(table_32bytes) + .long L(12bytes) - L(table_32bytes) + .long L(13bytes) - L(table_32bytes) + .long L(14bytes) - L(table_32bytes) + .long L(15bytes) - L(table_32bytes) + .long L(16bytes) - L(table_32bytes) + .long L(17bytes) - L(table_32bytes) + .long L(18bytes) - L(table_32bytes) + .long L(19bytes) - L(table_32bytes) + .long L(20bytes) - L(table_32bytes) + .long L(21bytes) - L(table_32bytes) + .long L(22bytes) - L(table_32bytes) + .long L(23bytes) - L(table_32bytes) + .long L(24bytes) - L(table_32bytes) + .long L(25bytes) - L(table_32bytes) + .long L(26bytes) - L(table_32bytes) + .long L(27bytes) - L(table_32bytes) + .long L(28bytes) - L(table_32bytes) + .long L(29bytes) - L(table_32bytes) + .long L(30bytes) - L(table_32bytes) + .long L(31bytes) - L(table_32bytes) + + +#undef bcmp +weak_alias (memcmp, bcmp) +libc_hidden_builtin_def (memcmp) diff --git a/REORG.TODO/sysdeps/i386/i686/memcpy.S b/REORG.TODO/sysdeps/i386/i686/memcpy.S new file mode 100644 index 0000000000..1d61447430 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/memcpy.S @@ -0,0 +1,98 @@ +/* Copy memory block and return pointer to beginning of destination block + For Intel 80x86, x>=6. + This file is part of the GNU C Library. + Copyright (C) 1999-2017 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
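The L(table_32bytes) jump table above is a hand-built dense switch on the residual length, with each L(Nbytes) entry falling through toward L(0bytes) and all loads using negative offsets from pre-advanced pointers. The same shape in portable C, reduced to a four-byte tail (a sketch; a compiler lowers such a switch to exactly this kind of relative jump table):

static int
tail_compare (const unsigned char *a, const unsigned char *b,
              unsigned int n)        /* n < 4 here; the asm covers n < 32 */
{
  a += n;                            /* pre-advance, as the asm does,     */
  b += n;                            /* then index with negative offsets  */
  switch (n)
    {
    case 3: if (a[-3] != b[-3]) return a[-3] < b[-3] ? -1 : 1;
            /* FALLTHRU */
    case 2: if (a[-2] != b[-2]) return a[-2] < b[-2] ? -1 : 1;
            /* FALLTHRU */
    case 1: if (a[-1] != b[-1]) return a[-1] < b[-1] ? -1 : 1;
            /* FALLTHRU */
    default: return 0;
    }
}

Storing label-minus-table offsets instead of absolute addresses is what keeps the table usable in position-independent code; LOAD_JUMP_TABLE_ENTRY adds the offset back onto the table's runtime address.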
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4 /* no space for saved regs */ +#define RTN PARMS +#define DEST RTN +#define SRC DEST+4 +#define LEN SRC+4 + + .text +#if defined PIC && IS_IN (libc) +ENTRY_CHK (__memcpy_chk) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (__memcpy_chk) +#endif +ENTRY (memcpy) + + movl %edi, %eax + movl DEST(%esp), %edi + movl %esi, %edx + movl SRC(%esp), %esi + + movl %edi, %ecx + xorl %esi, %ecx + andl $3, %ecx + movl LEN(%esp), %ecx + cld + jne .Lunaligned + + cmpl $3, %ecx + jbe .Lunaligned + + testl $3, %esi + je 1f + movsb + decl %ecx + testl $3, %esi + je 1f + movsb + decl %ecx + testl $3, %esi + je 1f + movsb + decl %ecx +1: pushl %eax + movl %ecx, %eax + shrl $2, %ecx + andl $3, %eax + rep + movsl + movl %eax, %ecx + rep + movsb + popl %eax + +.Lend: movl %eax, %edi + movl %edx, %esi + movl DEST(%esp), %eax + + ret + + /* When we come here the pointers do not have the same + alignment or the length is too short. No need to optimize for + aligned memory accesses. */ +.Lunaligned: + shrl $1, %ecx + jnc 1f + movsb +1: shrl $1, %ecx + jnc 2f + movsw +2: rep + movsl + jmp .Lend +END (memcpy) +libc_hidden_builtin_def (memcpy) diff --git a/REORG.TODO/sysdeps/i386/i686/memmove.S b/REORG.TODO/sysdeps/i386/i686/memmove.S new file mode 100644 index 0000000000..f60c3d501b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/memmove.S @@ -0,0 +1,120 @@ +/* Copy memory block and return pointer to beginning of destination block + For Intel 80x86, x>=6. + This file is part of the GNU C Library. + Copyright (C) 2003-2017 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 2003. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
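In C terms, the aligned path of this memcpy does roughly the following (a sketch; memcpy_model is an illustrative name, and the separate path taken when source and destination are mutually misaligned is omitted):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void *
memcpy_model (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;
  while (n > 0 && ((uintptr_t) s & 3) != 0)   /* up to three movsb's   */
    { *d++ = *s++; n--; }
  for (; n >= 4; n -= 4, d += 4, s += 4)
    memcpy (d, s, 4);                         /* one word, like movsl  */
  while (n-- > 0)                             /* trailing rep movsb    */
    *d++ = *s++;
  return dst;
}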
*/ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4+4 /* one spilled register */ +#define RTN PARMS + + .text + +#ifdef USE_AS_BCOPY +# define SRC RTN +# define DEST SRC+4 +# define LEN DEST+4 +#else +# define DEST RTN +# define SRC DEST+4 +# define LEN SRC+4 + +# if defined PIC && IS_IN (libc) +ENTRY_CHK (__memmove_chk) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (__memmove_chk) +# endif +#endif + +ENTRY (memmove) + + pushl %edi + cfi_adjust_cfa_offset (4) + + movl LEN(%esp), %ecx + movl DEST(%esp), %edi + cfi_rel_offset (edi, 0) + movl %esi, %edx + movl SRC(%esp), %esi + cfi_register (esi, edx) + + movl %edi, %eax + subl %esi, %eax + cmpl %eax, %ecx + ja 3f + + cld + shrl $1, %ecx + jnc 1f + movsb +1: shrl $1, %ecx + jnc 2f + movsw +2: rep + movsl + movl %edx, %esi + cfi_restore (esi) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +#endif + + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret + + cfi_adjust_cfa_offset (4) + cfi_rel_offset (edi, 0) + cfi_register (esi, edx) + + /* Backward copying. */ +3: std + leal -1(%edi, %ecx), %edi + leal -1(%esi, %ecx), %esi + shrl $1, %ecx + jnc 1f + movsb +1: subl $1, %edi + subl $1, %esi + shrl $1, %ecx + jnc 2f + movsw +2: subl $2, %edi + subl $2, %esi + rep + movsl + movl %edx, %esi + cfi_restore (esi) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +#endif + + cld + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (memmove) +#ifndef USE_AS_BCOPY +libc_hidden_builtin_def (memmove) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/mempcpy.S new file mode 100644 index 0000000000..31cb4efdb2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/mempcpy.S @@ -0,0 +1,65 @@ +/* Copy memory block and return pointer to following byte. + For Intel 80x86, x>=6. + This file is part of the GNU C Library. + Copyright (C) 1998-2017 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
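The direction test in this memmove is a single unsigned comparison: subtracting the pointers and comparing against the length detects exactly the overlaps that force a backward copy. A C sketch (memmove_model is an illustrative name):

#include <stddef.h>
#include <stdint.h>

static void *
memmove_model (void *dstv, const void *srcv, size_t n)
{
  unsigned char *d = dstv;
  const unsigned char *s = srcv;
  /* (uintptr_t) (d - s) < n holds iff d lies inside [s, s + n);
     only then must the copy run backward (the std path).  */
  if ((uintptr_t) d - (uintptr_t) s < n)
    while (n-- > 0)
      d[n] = s[n];
  else
    for (size_t i = 0; i < n; i++)
      d[i] = s[i];
  return dstv;
}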
*/ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4 /* no space for saved regs */ +#define RTN PARMS +#define DEST RTN +#define SRC DEST+4 +#define LEN SRC+4 + + .text +#if defined PIC && IS_IN (libc) +ENTRY_CHK (__mempcpy_chk) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (__mempcpy_chk) +#endif +ENTRY (__mempcpy) + + movl LEN(%esp), %ecx + movl %edi, %eax + cfi_register (edi, eax) + movl DEST(%esp), %edi + movl %esi, %edx + cfi_register (esi, edx) + movl SRC(%esp), %esi + cld + shrl $1, %ecx + jnc 1f + movsb +1: shrl $1, %ecx + jnc 2f + movsw +2: rep + movsl + xchgl %edi, %eax + cfi_restore (edi) + movl %edx, %esi + cfi_restore (esi) + + ret +END (__mempcpy) +libc_hidden_def (__mempcpy) +weak_alias (__mempcpy, mempcpy) +libc_hidden_builtin_def (mempcpy) diff --git a/REORG.TODO/sysdeps/i386/i686/memset.S b/REORG.TODO/sysdeps/i386/i686/memset.S new file mode 100644 index 0000000000..24d06178d2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/memset.S @@ -0,0 +1,100 @@ +/* memset/bzero -- set memory area to CH/0 + Highly optimized version for ix86, x>=6. + Copyright (C) 1999-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
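__mempcpy above is memcpy with a different return value: after the rep movs sequence, %edi already points one past the last byte written, and the xchgl moves that into the return register instead of reloading the start pointer. Equivalent C (a sketch; mempcpy_model is an illustrative name):

#include <string.h>
#include <stddef.h>

static void *
mempcpy_model (void *dst, const void *src, size_t n)
{
  memcpy (dst, src, n);
  return (unsigned char *) dst + n;   /* where %edi ends up */
}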
*/ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4+4 /* space for 1 saved reg */ +#ifdef USE_AS_BZERO +# define DEST PARMS +# define LEN DEST+4 +#else +# define RTN PARMS +# define DEST RTN +# define CHR DEST+4 +# define LEN CHR+4 +#endif + + .text +#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO +ENTRY_CHK (__memset_chk) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END_CHK (__memset_chk) +#endif +ENTRY (memset) + + cld + pushl %edi + cfi_adjust_cfa_offset (4) + movl DEST(%esp), %edx + movl LEN(%esp), %ecx +#ifdef USE_AS_BZERO + xorl %eax, %eax /* fill with 0 */ +#else + movzbl CHR(%esp), %eax +#endif + jecxz 1f + movl %edx, %edi + cfi_rel_offset (edi, 0) + andl $3, %edx + jz 2f /* aligned */ + jp 3f /* misaligned at 3, store just one byte below */ + stosb /* misaligned at 1 or 2, store two bytes */ + decl %ecx + jz 1f +3: stosb + decl %ecx + jz 1f + xorl $1, %edx + jnz 2f /* was misaligned at 2 or 3, now aligned */ + stosb /* was misaligned at 1, store third byte */ + decl %ecx +2: movl %ecx, %edx + shrl $2, %ecx + andl $3, %edx +#ifndef USE_AS_BZERO + imul $0x01010101, %eax +#endif + rep + stosl + movl %edx, %ecx + rep + stosb + +1: +#ifndef USE_AS_BZERO + movl DEST(%esp), %eax /* start address of destination is result */ +#endif + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (memset) +libc_hidden_builtin_def (memset) + +#if defined SHARED && IS_IN (libc) && !defined __memset_chk \ + && !defined USE_AS_BZERO +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) + .section .gnu.warning.__memset_zero_constant_len_parameter + .string "memset used with constant zero length parameter; this could be due to transposed parameters" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/memusage.h b/REORG.TODO/sysdeps/i386/i686/memusage.h new file mode 100644 index 0000000000..77a020d7c0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/memusage.h @@ -0,0 +1,21 @@ +/* Copyright (C) 2000-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
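The imul $0x01010101 above turns the fill byte into a four-byte pattern so rep stosl can store a word at a time. The same splat in C (a sketch; splat_byte is an illustrative name):

#include <stdint.h>

static uint32_t
splat_byte (int c)
{
  /* 0x01010101 * c copies the low byte into every byte of the word;
     e.g. c = 0xAB gives 0xABABABAB.  */
  return (uint32_t) (unsigned char) c * 0x01010101u;
}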
*/ + +#define GETSP() ({ register uintptr_t stack_ptr asm ("esp"); stack_ptr; }) +#define GETTIME(low,high) asm ("rdtsc" : "=a" (low), "=d" (high)) + +#include <sysdeps/generic/memusage.h> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile new file mode 100644 index 0000000000..4a0c20c051 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/Makefile @@ -0,0 +1,44 @@ +ifeq ($(subdir),csu) +tests += test-multiarch +endif + +ifeq ($(subdir),string) +gen-as-const-headers += locale-defines.sym +sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ + memmove-ssse3 memcpy-ssse3-rep mempcpy-ssse3-rep \ + memmove-ssse3-rep bcopy-ssse3 bcopy-ssse3-rep \ + memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \ + strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \ + memcmp-ssse3 memcmp-sse4 varshift \ + strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \ + strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \ + strncpy-sse2 stpcpy-sse2 stpncpy-sse2 strcat-ssse3 \ + strcat-sse2 strncat-ssse3 strncat-sse2 strncat-c \ + strchr-sse2 strrchr-sse2 strchr-sse2-bsf strrchr-sse2-bsf \ + memchr-sse2 memchr-sse2-bsf \ + memrchr-sse2 memrchr-sse2-bsf memrchr-c \ + rawmemchr-sse2 rawmemchr-sse2-bsf \ + strnlen-sse2 strnlen-c \ + strcasecmp_l-c strcasecmp-c strcasecmp_l-ssse3 \ + strncase_l-c strncase-c strncase_l-ssse3 \ + strcasecmp_l-sse4 strncase_l-sse4 \ + bcopy-sse2-unaligned memcpy-sse2-unaligned \ + mempcpy-sse2-unaligned memmove-sse2-unaligned \ + strcspn-c strpbrk-c strspn-c +CFLAGS-varshift.c += -msse4 +CFLAGS-strcspn-c.c += -msse4 +CFLAGS-strpbrk-c.c += -msse4 +CFLAGS-strspn-c.c += -msse4 +endif + +ifeq ($(subdir),wcsmbs) +sysdep_routines += wcscmp-sse2 wcscmp-c wcslen-sse2 wcslen-c \ + wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcschr-sse2 \ + wcschr-c wcsrchr-sse2 wcsrchr-c wcscpy-ssse3 wcscpy-c +endif + +ifeq ($(subdir),math) +libm-sysdep_routines += s_fma-fma s_fmaf-fma +CFLAGS-s_fma-fma.c += -mavx -mfpmath=sse +CFLAGS-s_fmaf-fma.c += -mavx -mfpmath=sse +endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S new file mode 100644 index 0000000000..efef2a10dd --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_sse2_unaligned +#include "memcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S new file mode 100644 index 0000000000..cbc8b420e8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S new file mode 100644 index 0000000000..36aac44b9c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define USE_AS_BCOPY +#define MEMCPY __bcopy_ssse3 +#include "memcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S new file mode 100644 index 0000000000..877f82c28f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bcopy.S @@ -0,0 +1,59 @@ +/* Multiple versions of bcopy + All versions must be listed in ifunc-impl-list.c. 
+ Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) + .text +ENTRY(bcopy) + .type bcopy, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__bcopy_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__bcopy_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__bcopy_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__bcopy_ssse3_rep) +2: ret +END(bcopy) + +# undef ENTRY +# define ENTRY(name) \ + .type __bcopy_ia32, @function; \ + .p2align 4; \ + .globl __bcopy_ia32; \ + .hidden __bcopy_ia32; \ + __bcopy_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __bcopy_ia32, .-__bcopy_ia32 + +#endif + +#include "../bcopy.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S new file mode 100644 index 0000000000..507b288bb3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2-rep.S @@ -0,0 +1,3 @@ +#define USE_AS_BZERO +#define __memset_sse2_rep __bzero_sse2_rep +#include "memset-sse2-rep.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S new file mode 100644 index 0000000000..8d04512e4e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_BZERO +#define __memset_sse2 __bzero_sse2 +#include "memset-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S new file mode 100644 index 0000000000..9dac490aa2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/bzero.S @@ -0,0 +1,62 @@ +/* Multiple versions of bzero + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
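The resolver above encodes a preference order that is easy to misread in assembly: when Fast_Unaligned_Load is not set and SSSE3 is also absent, %eax still holds the sse2_unaligned entry, so that version serves plain-SSE2 CPUs too. Read back into C (a sketch; the externs stand in for the real prototypes, and HAS_CPU_FEATURE/HAS_ARCH_FEATURE are the init-arch.h predicates):

#include <init-arch.h>

extern void __bcopy_ia32 (void), __bcopy_sse2_unaligned (void),
            __bcopy_ssse3 (void), __bcopy_ssse3_rep (void);

static void *
bcopy_resolver (void)
{
  if (!HAS_CPU_FEATURE (SSE2))
    return (void *) __bcopy_ia32;
  if (HAS_ARCH_FEATURE (Fast_Unaligned_Load)
      || !HAS_CPU_FEATURE (SSSE3))
    return (void *) __bcopy_sse2_unaligned;
  if (!HAS_ARCH_FEATURE (Fast_Rep_String))
    return (void *) __bcopy_ssse3;
  return (void *) __bcopy_ssse3_rep;
}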
*/ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) + .text +ENTRY(__bzero) + .type __bzero, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__bzero_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX ( __bzero_sse2) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__bzero_sse2_rep) +2: ret +END(__bzero) + +# undef ENTRY +# define ENTRY(name) \ + .type __bzero_ia32, @function; \ + .p2align 4; \ + .globl __bzero_ia32; \ + .hidden __bzero_ia32; \ + __bzero_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __bzero_ia32, .-__bzero_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI___bzero; __GI___bzero = __bzero_ia32 +# endif +#endif + +#include "../bzero.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c new file mode 100644 index 0000000000..e8026a2a78 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/ifunc-impl-list.c @@ -0,0 +1,376 @@ +/* Enumerate available IFUNC implementations of a function. i686 version. + Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <assert.h> +#include <string.h> +#include <wchar.h> +#include <ifunc-impl-list.h> +#include "init-arch.h" + +/* Maximum number of IFUNC implementations. */ +#define MAX_IFUNC 4 + +/* Fill ARRAY of MAX elements with IFUNC implementations for function + NAME and return the number of valid entries. */ + +size_t +__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + size_t max) +{ + assert (max >= MAX_IFUNC); + + size_t i = 0; + + /* Support sysdeps/i386/i686/multiarch/bcopy.S. */ + IFUNC_IMPL (i, name, bcopy, + IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3), + __bcopy_ssse3_rep) + IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSSE3), + __bcopy_ssse3) + IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2), + __bcopy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/bzero.S. */ + IFUNC_IMPL (i, name, bzero, + IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2), + __bzero_sse2_rep) + IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2), + __bzero_sse2) + IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memchr.S. 
*/ + IFUNC_IMPL (i, name, memchr, + IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2), + __memchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, memchr, HAS_CPU_FEATURE (SSE2), + __memchr_sse2) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memcmp.S. */ + IFUNC_IMPL (i, name, memcmp, + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_2), + __memcmp_sse4_2) + IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3), + __memcmp_ssse3) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memmove_chk.S. */ + IFUNC_IMPL (i, name, __memmove_chk, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), + __memmove_chk_ssse3_rep) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSSE3), + __memmove_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_CPU_FEATURE (SSE2), + __memmove_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, + __memmove_chk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memmove.S. */ + IFUNC_IMPL (i, name, memmove, + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), + __memmove_ssse3_rep) + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), + __memmove_ssse3) + IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2), + __memmove_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memrchr.S. */ + IFUNC_IMPL (i, name, memrchr, + IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2), + __memrchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, memrchr, HAS_CPU_FEATURE (SSE2), + __memrchr_sse2) + IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memset_chk.S. */ + IFUNC_IMPL (i, name, __memset_chk, + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_CPU_FEATURE (SSE2), + __memset_chk_sse2_rep) + IFUNC_IMPL_ADD (array, i, __memset_chk, + HAS_CPU_FEATURE (SSE2), + __memset_chk_sse2) + IFUNC_IMPL_ADD (array, i, __memset_chk, 1, + __memset_chk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memset.S. */ + IFUNC_IMPL (i, name, memset, + IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2), + __memset_sse2_rep) + IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2), + __memset_sse2) + IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ia32)) + + /* Support sysdeps/i386/i686/multiarch/rawmemchr.S. */ + IFUNC_IMPL (i, name, rawmemchr, + IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2), + __rawmemchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, rawmemchr, HAS_CPU_FEATURE (SSE2), + __rawmemchr_sse2) + IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/stpncpy.S. */ + IFUNC_IMPL (i, name, stpncpy, + IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3), + __stpncpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSE2), + __stpncpy_sse2) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/stpcpy.S. */ + IFUNC_IMPL (i, name, stpcpy, + IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3), + __stpcpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSE2), + __stpcpy_sse2) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcasecmp.S. 
*/ + IFUNC_IMPL (i, name, strcasecmp, + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSE4_2), + __strcasecmp_sse4_2) + IFUNC_IMPL_ADD (array, i, strcasecmp, + HAS_CPU_FEATURE (SSSE3), + __strcasecmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcasecmp_l.S. */ + IFUNC_IMPL (i, name, strcasecmp_l, + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSE4_2), + __strcasecmp_l_sse4_2) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + HAS_CPU_FEATURE (SSSE3), + __strcasecmp_l_ssse3) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, + __strcasecmp_l_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcat.S. */ + IFUNC_IMPL (i, name, strcat, + IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3), + __strcat_ssse3) + IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSE2), + __strcat_sse2) + IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strchr.S. */ + IFUNC_IMPL (i, name, strchr, + IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2), + __strchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, strchr, HAS_CPU_FEATURE (SSE2), + __strchr_sse2) + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcmp.S. */ + IFUNC_IMPL (i, name, strcmp, + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2), + __strcmp_sse4_2) + IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3), + __strcmp_ssse3) + IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcpy.S. */ + IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3), + __strcpy_ssse3) + IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSE2), + __strcpy_sse2) + IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strcspn.S. */ + IFUNC_IMPL (i, name, strcspn, + IFUNC_IMPL_ADD (array, i, strcspn, HAS_CPU_FEATURE (SSE4_2), + __strcspn_sse42) + IFUNC_IMPL_ADD (array, i, strcspn, 1, __strcspn_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncase.S. */ + IFUNC_IMPL (i, name, strncasecmp, + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSE4_2), + __strncasecmp_sse4_2) + IFUNC_IMPL_ADD (array, i, strncasecmp, + HAS_CPU_FEATURE (SSSE3), + __strncasecmp_ssse3) + IFUNC_IMPL_ADD (array, i, strncasecmp, 1, + __strncasecmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncase_l.S. */ + IFUNC_IMPL (i, name, strncasecmp_l, + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSE4_2), + __strncasecmp_l_sse4_2) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + HAS_CPU_FEATURE (SSSE3), + __strncasecmp_l_ssse3) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, + __strncasecmp_l_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncat.S. */ + IFUNC_IMPL (i, name, strncat, + IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3), + __strncat_ssse3) + IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSE2), + __strncat_sse2) + IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncpy.S. */ + IFUNC_IMPL (i, name, strncpy, + IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3), + __strncpy_ssse3) + IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSE2), + __strncpy_sse2) + IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strnlen.S. 
*/ + IFUNC_IMPL (i, name, strnlen, + IFUNC_IMPL_ADD (array, i, strnlen, HAS_CPU_FEATURE (SSE2), + __strnlen_sse2) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strpbrk.S. */ + IFUNC_IMPL (i, name, strpbrk, + IFUNC_IMPL_ADD (array, i, strpbrk, HAS_CPU_FEATURE (SSE4_2), + __strpbrk_sse42) + IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strrchr.S. */ + IFUNC_IMPL (i, name, strrchr, + IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2), + __strrchr_sse2_bsf) + IFUNC_IMPL_ADD (array, i, strrchr, HAS_CPU_FEATURE (SSE2), + __strrchr_sse2) + IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strspn.S. */ + IFUNC_IMPL (i, name, strspn, + IFUNC_IMPL_ADD (array, i, strspn, HAS_CPU_FEATURE (SSE4_2), + __strspn_sse42) + IFUNC_IMPL_ADD (array, i, strspn, 1, __strspn_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcschr.S. */ + IFUNC_IMPL (i, name, wcschr, + IFUNC_IMPL_ADD (array, i, wcschr, HAS_CPU_FEATURE (SSE2), + __wcschr_sse2) + IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcscmp.S. */ + IFUNC_IMPL (i, name, wcscmp, + IFUNC_IMPL_ADD (array, i, wcscmp, HAS_CPU_FEATURE (SSE2), + __wcscmp_sse2) + IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcscpy.S. */ + IFUNC_IMPL (i, name, wcscpy, + IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3), + __wcscpy_ssse3) + IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcslen.S. */ + IFUNC_IMPL (i, name, wcslen, + IFUNC_IMPL_ADD (array, i, wcslen, HAS_CPU_FEATURE (SSE2), + __wcslen_sse2) + IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wcsrchr.S. */ + IFUNC_IMPL (i, name, wcsrchr, + IFUNC_IMPL_ADD (array, i, wcsrchr, HAS_CPU_FEATURE (SSE2), + __wcsrchr_sse2) + IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_ia32)) + + /* Support sysdeps/i386/i686/multiarch/wmemcmp.S. */ + IFUNC_IMPL (i, name, wmemcmp, + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_2), + __wmemcmp_sse4_2) + IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3), + __wmemcmp_ssse3) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_ia32)) + +#ifdef SHARED + /* Support sysdeps/i386/i686/multiarch/memcpy_chk.S. */ + IFUNC_IMPL (i, name, __memcpy_chk, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __memcpy_chk_ssse3_rep) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __memcpy_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_CPU_FEATURE (SSE2), + __memcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, + __memcpy_chk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/memcpy.S. */ + IFUNC_IMPL (i, name, memcpy, + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), + __memcpy_ssse3_rep) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), + __memcpy_ssse3) + IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2), + __memcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S. 
*/ + IFUNC_IMPL (i, name, __mempcpy_chk, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __mempcpy_chk_ssse3_rep) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSSE3), + __mempcpy_chk_ssse3) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_CPU_FEATURE (SSE2), + __mempcpy_chk_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, + __mempcpy_chk_ia32)) + + /* Support sysdeps/i386/i686/multiarch/mempcpy.S. */ + IFUNC_IMPL (i, name, mempcpy, + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), + __mempcpy_ssse3_rep) + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), + __mempcpy_ssse3) + IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2), + __mempcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strlen.S. */ + IFUNC_IMPL (i, name, strlen, + IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2), + __strlen_sse2_bsf) + IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE2), + __strlen_sse2) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_ia32)) + + /* Support sysdeps/i386/i686/multiarch/strncmp.S. */ + IFUNC_IMPL (i, name, strncmp, + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2), + __strncmp_sse4_2) + IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3), + __strncmp_ssse3) + IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_ia32)) +#endif + + return i; +} diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym new file mode 100644 index 0000000000..aebff9a4f9 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/locale-defines.sym @@ -0,0 +1,11 @@ +#include <locale/localeinfo.h> +#include <langinfo.h> +#include <stddef.h> + +-- + +LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales) +LC_CTYPE +_NL_CTYPE_NONASCII_CASE +LOCALE_DATA_VALUES offsetof (struct __locale_data, values) +SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0]) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S new file mode 100644 index 0000000000..dd316486e6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S @@ -0,0 +1,502 @@ +/* Optimized memchr with sse2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + +# ifndef USE_AS_RAWMEMCHR +# define LEN STR2+4 +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); +# endif + +# ifndef MEMCHR +# define MEMCHR __memchr_sse2_bsf +# endif + + .text +ENTRY (MEMCHR) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + +# ifndef USE_AS_RAWMEMCHR + mov LEN(%esp), %edx + test %edx, %edx + jz L(return_null_1) +# endif + mov %ecx, %eax + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + + cmp $48, %ecx + ja L(crosscache) + + movdqu (%eax), %xmm0 + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %ecx + test %ecx, %ecx + je L(unaligned_no_match_1) +/* Check which byte is a match. */ + bsf %ecx, %ecx + +# ifndef USE_AS_RAWMEMCHR + sub %ecx, %edx + jbe L(return_null_1) +# endif + add %ecx, %eax + ret + + .p2align 4 +L(unaligned_no_match_1): +# ifndef USE_AS_RAWMEMCHR + sub $16, %edx + jbe L(return_null_1) + PUSH (%edi) + lea 16(%eax), %edi + and $15, %eax + and $-16, %edi + add %eax, %edx +# else + lea 16(%eax), %edx + and $-16, %edx +# endif + jmp L(loop_prolog) + + .p2align 4 +L(return_null_1): + xor %eax, %eax + ret + +# ifndef USE_AS_RAWMEMCHR + CFI_POP (%edi) +# endif + + .p2align 4 +L(crosscache): +/* Handle unaligned string. */ + +# ifndef USE_AS_RAWMEMCHR + PUSH (%edi) + mov %eax, %edi + and $15, %ecx + and $-16, %edi + movdqa (%edi), %xmm0 +# else + mov %eax, %edx + and $15, %ecx + and $-16, %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 +/* Check if there is a match. */ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +/* Check which byte is a match. */ + bsf %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + sub %eax, %edx + jbe L(return_null) + add %edi, %eax + add %ecx, %eax + RETURN +# else + add %edx, %eax + add %ecx, %eax + ret +# endif + + .p2align 4 +L(unaligned_no_match): +# ifndef USE_AS_RAWMEMCHR + /* Calculate the last acceptable address and check for possible + addition overflow by using satured math: + edx = ecx + edx + edx |= -(edx < ecx) */ + add %ecx, %edx + sbb %eax, %eax + or %eax, %edx + sub $16, %edx + jbe L(return_null) + add $16, %edi +# else + add $16, %edx +# endif + + .p2align 4 +/* Loop start on aligned string. 
*/ +L(loop_prolog): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + +# ifndef USE_AS_RAWMEMCHR + test $0x3f, %edi +# else + test $0x3f, %edx +# endif + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm3 +# else + movdqa 48(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + test %eax, %eax + jnz L(matches0) + +# ifndef USE_AS_RAWMEMCHR + mov %edi, %ecx + and $-64, %edi + and $63, %ecx + add %ecx, %edx +# else + and $-64, %edx +# endif + + .p2align 4 +L(align64_loop): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 + movdqa 16(%edi), %xmm2 + movdqa 32(%edi), %xmm3 + movdqa 48(%edi), %xmm4 +# else + movdqa (%edx), %xmm0 + movdqa 16(%edx), %xmm2 + movdqa 32(%edx), %xmm3 + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + pmovmskb %xmm4, %eax + +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + + test %eax, %eax + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edi +# else + sub $64, %edx +# endif + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + + pcmpeqb %xmm1, %xmm3 + +# ifndef USE_AS_RAWMEMCHR + pcmpeqb 48(%edi), %xmm1 +# else + pcmpeqb 48(%edx), %xmm1 +# endif + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + pmovmskb %xmm1, %eax + bsf %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + lea 48(%edi, %eax), %eax + RETURN +# else + lea 48(%edx, %eax), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + + movdqa 16(%edi), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + movdqa 32(%edi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test 
%eax, %eax + jnz L(matches32_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb 48(%edi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches48_1) + xor %eax, %eax + RETURN + + .p2align 4 +L(exit_loop_32): + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 16(%edi), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches16_1) + xor %eax, %eax + RETURN +# endif + .p2align 4 +L(matches0): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea -16(%eax, %edi), %eax + RETURN +# else + lea -16(%eax, %edx), %eax + ret +# endif + + .p2align 4 +L(matches): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + add %edi, %eax + RETURN +# else + add %edx, %eax + ret +# endif + + .p2align 4 +L(matches16): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea 16(%eax, %edi), %eax + RETURN +# else + lea 16(%eax, %edx), %eax + ret +# endif + + .p2align 4 +L(matches32): + bsf %eax, %eax +# ifndef USE_AS_RAWMEMCHR + lea 32(%eax, %edi), %eax + RETURN +# else + lea 32(%eax, %edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(matches_1): + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + add %edi, %eax + RETURN + + .p2align 4 +L(matches16_1): + sub $16, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 16(%edi, %eax), %eax + RETURN + + .p2align 4 +L(matches32_1): + sub $32, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 32(%edi, %eax), %eax + RETURN + + .p2align 4 +L(matches48_1): + sub $48, %edx + bsf %eax, %eax + sub %eax, %edx + jbe L(return_null) + + lea 48(%edi, %eax), %eax + RETURN +# endif + .p2align 4 +L(return_null): + xor %eax, %eax +# ifndef USE_AS_RAWMEMCHR + RETURN +# else + ret +# endif + +END (MEMCHR) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S new file mode 100644 index 0000000000..172d70de13 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr-sse2.S @@ -0,0 +1,709 @@ +/* Optimized memchr with sse2 without bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
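+
+   This is the variant that never executes bsf: the ifunc selector in
+   multiarch/memchr.S picks it when the Slow_BSF feature bit is set
+   (Atom-class processors), and the unrolled bit-test exit paths
+   below stand in for the bit scan.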
*/ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef USE_AS_RAWMEMCHR +# define ENTRANCE PUSH(%edi); +# define PARMS 8 +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); +# else +# define ENTRANCE +# define PARMS 4 +# endif + +# define STR1 PARMS +# define STR2 STR1+4 + +# ifndef USE_AS_RAWMEMCHR +# define LEN STR2+4 +# endif + +# ifndef MEMCHR +# define MEMCHR __memchr_sse2 +# endif + + atom_text_section +ENTRY (MEMCHR) + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 +# ifndef USE_AS_RAWMEMCHR + mov LEN(%esp), %edx + test %edx, %edx + jz L(return_null) +# endif + + punpcklbw %xmm1, %xmm1 +# ifndef USE_AS_RAWMEMCHR + mov %ecx, %edi +# else + mov %ecx, %edx +# endif + punpcklbw %xmm1, %xmm1 + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + cmp $48, %ecx + ja L(crosscache) + +# ifndef USE_AS_RAWMEMCHR + movdqu (%edi), %xmm0 +# else + movdqu (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax +# ifndef USE_AS_RAWMEMCHR + jnz L(match_case2_prolog) + + sub $16, %edx + jbe L(return_null) + lea 16(%edi), %edi + and $15, %ecx + and $-16, %edi + add %ecx, %edx +# else + jnz L(match_case1_prolog) + lea 16(%edx), %edx + and $-16, %edx +# endif + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %ecx +# ifndef USE_AS_RAWMEMCHR + and $-16, %edi + movdqa (%edi), %xmm0 +# else + and $-16, %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + sar %cl, %eax + test %eax, %eax + +# ifndef USE_AS_RAWMEMCHR + jnz L(match_case2_prolog1) + /* "ecx" is less than 16. Calculate "edx + ecx - 16" by using "edx - (16 - ecx)" instead of "(edx + ecx) - 16" to avoid possible addition overflow. 
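+   For example, with ecx == 12 and edx == 3 the subtraction computes
+   3 - (16 - 12) and borrows, so the jbe below returns NULL; forming
+   edx + ecx first could instead wrap around when edx is close to the
+   top of the 32-bit range.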
*/ + neg %ecx + add $16, %ecx + sub %ecx, %edx + jbe L(return_null) + lea 16(%edi), %edi +# else + jnz L(match_case1_prolog1) + lea 16(%edx), %edx +# endif + + .p2align 4 +L(loop_prolog): +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 +# else + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + lea 16(%ecx), %ecx + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + lea 64(%edi), %edi + sub $64, %edx + jbe L(exit_loop) + + movdqa (%edi), %xmm0 +# else + lea 64(%edx), %edx + movdqa (%edx), %xmm0 +# endif + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 16(%edi), %xmm2 +# else + movdqa 16(%edx), %xmm2 +# endif + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 48(%edi), %xmm4 +# else + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm4 + lea 16(%ecx), %ecx + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + lea 64(%edi), %edi + mov %edi, %ecx + and $-64, %edi + and $63, %ecx + add %ecx, %edx +# else + lea 64(%edx), %edx + and $-64, %edx +# endif + + .p2align 4 +L(align64_loop): + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 + movdqa 16(%edi), %xmm2 + movdqa 32(%edi), %xmm3 + movdqa 48(%edi), %xmm4 +# else + movdqa (%edx), %xmm0 + movdqa 16(%edx), %xmm2 + movdqa 32(%edx), %xmm3 + movdqa 48(%edx), %xmm4 +# endif + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 +# ifndef USE_AS_RAWMEMCHR + add $64, %edi +# else + add $64, %edx +# endif + pmovmskb %xmm4, %eax + + test %eax, %eax + jz L(align64_loop) + +# ifndef USE_AS_RAWMEMCHR + sub $64, %edi +# else + sub $64, %edx +# endif + + pmovmskb %xmm0, %eax + xor %ecx, %ecx + test %eax, %eax + jnz L(match_case1) + + pmovmskb %xmm2, %eax + lea 16(%ecx), %ecx + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + movdqa 32(%edi), %xmm3 +# else + movdqa 32(%edx), %xmm3 +# endif + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + lea 16(%ecx), %ecx + test %eax, %eax + jnz L(match_case1) + +# ifndef USE_AS_RAWMEMCHR + pcmpeqb 48(%edi), %xmm1 +# else + pcmpeqb 48(%edx), %xmm1 +# endif + pmovmskb %xmm1, %eax + lea 16(%ecx), %ecx + + .p2align 4 +L(match_case1): +# ifndef USE_AS_RAWMEMCHR + add %ecx, %edi +# else +L(match_case1_prolog1): + add %ecx, %edx +L(match_case1_prolog): +# endif + test %al, %al + jz L(match_case1_high) + mov %al, %cl + and $15, %cl + jz L(match_case1_8) + test $0x01, %al + jnz 
L(ExitCase1_1) + test $0x02, %al + jnz L(ExitCase1_2) + test $0x04, %al + jnz L(ExitCase1_3) +# ifndef USE_AS_RAWMEMCHR + lea 3(%edi), %eax + RETURN +# else + lea 3(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_8): + test $0x10, %al + jnz L(ExitCase1_5) + test $0x20, %al + jnz L(ExitCase1_6) + test $0x40, %al + jnz L(ExitCase1_7) +# ifndef USE_AS_RAWMEMCHR + lea 7(%edi), %eax + RETURN +# else + lea 7(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_high): + mov %ah, %ch + and $15, %ch + jz L(match_case1_high_8) + test $0x01, %ah + jnz L(ExitCase1_9) + test $0x02, %ah + jnz L(ExitCase1_10) + test $0x04, %ah + jnz L(ExitCase1_11) +# ifndef USE_AS_RAWMEMCHR + lea 11(%edi), %eax + RETURN +# else + lea 11(%edx), %eax + ret +# endif + + .p2align 4 +L(match_case1_high_8): + test $0x10, %ah + jnz L(ExitCase1_13) + test $0x20, %ah + jnz L(ExitCase1_14) + test $0x40, %ah + jnz L(ExitCase1_15) +# ifndef USE_AS_RAWMEMCHR + lea 15(%edi), %eax + RETURN +# else + lea 15(%edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(exit_loop): + add $64, %edx + + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case2) + cmp $16, %edx + jbe L(return_null) + + movdqa 16(%edi), %xmm2 + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case2) + cmp $32, %edx + jbe L(return_null) + + movdqa 32(%edi), %xmm3 + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case2) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb 48(%edi), %xmm1 + lea 16(%ecx), %ecx + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(match_case2) + + xor %eax, %eax + RETURN +# endif + + .p2align 4 +L(ExitCase1_1): +# ifndef USE_AS_RAWMEMCHR + mov %edi, %eax + RETURN +# else + mov %edx, %eax + ret +# endif + + .p2align 4 +L(ExitCase1_2): +# ifndef USE_AS_RAWMEMCHR + lea 1(%edi), %eax + RETURN +# else + lea 1(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_3): +# ifndef USE_AS_RAWMEMCHR + lea 2(%edi), %eax + RETURN +# else + lea 2(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_5): +# ifndef USE_AS_RAWMEMCHR + lea 4(%edi), %eax + RETURN +# else + lea 4(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_6): +# ifndef USE_AS_RAWMEMCHR + lea 5(%edi), %eax + RETURN +# else + lea 5(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_7): +# ifndef USE_AS_RAWMEMCHR + lea 6(%edi), %eax + RETURN +# else + lea 6(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_9): +# ifndef USE_AS_RAWMEMCHR + lea 8(%edi), %eax + RETURN +# else + lea 8(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_10): +# ifndef USE_AS_RAWMEMCHR + lea 9(%edi), %eax + RETURN +# else + lea 9(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_11): +# ifndef USE_AS_RAWMEMCHR + lea 10(%edi), %eax + RETURN +# else + lea 10(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_13): +# ifndef USE_AS_RAWMEMCHR + lea 12(%edi), %eax + RETURN +# else + lea 12(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_14): +# ifndef USE_AS_RAWMEMCHR + lea 13(%edi), %eax + RETURN +# else + lea 13(%edx), %eax + ret +# endif + + .p2align 4 +L(ExitCase1_15): +# ifndef USE_AS_RAWMEMCHR + lea 14(%edi), %eax + RETURN +# else + lea 14(%edx), %eax + ret +# endif + +# ifndef USE_AS_RAWMEMCHR + .p2align 4 +L(match_case2): + sub %ecx, %edx +L(match_case2_prolog1): + add %ecx, %edi +L(match_case2_prolog): + test %al, %al + jz L(match_case2_high) + mov %al, %cl + and $15, %cl + jz L(match_case2_8) 
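+/* The tests that follow walk the pcmpeqb mask one bit at a time: bit n
+   of %eax is set when byte n of the 16-byte block matched, and each
+   exit path checks the byte offset against the remaining length in
+   %edx before returning (offset 0 is always in range).  A C model of
+   what the cascade computes, with a hypothetical helper that is not
+   part of this file:
+
+     char *
+     first_match (char *block, unsigned int mask, size_t remaining)
+     {
+       unsigned int i = __builtin_ctz (mask);
+       return i < remaining ? block + i : NULL;
+     }
+
+   The unrolled tests exist precisely so that this variant never has
+   to execute bsf.  */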
+ test $0x01, %al + jnz L(ExitCase2_1) + test $0x02, %al + jnz L(ExitCase2_2) + test $0x04, %al + jnz L(ExitCase2_3) + sub $4, %edx + jb L(return_null) + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_8): + test $0x10, %al + jnz L(ExitCase2_5) + test $0x20, %al + jnz L(ExitCase2_6) + test $0x40, %al + jnz L(ExitCase2_7) + sub $8, %edx + jb L(return_null) + lea 7(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_high): + mov %ah, %ch + and $15, %ch + jz L(match_case2_high_8) + test $0x01, %ah + jnz L(ExitCase2_9) + test $0x02, %ah + jnz L(ExitCase2_10) + test $0x04, %ah + jnz L(ExitCase2_11) + sub $12, %edx + jb L(return_null) + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_high_8): + test $0x10, %ah + jnz L(ExitCase2_13) + test $0x20, %ah + jnz L(ExitCase2_14) + test $0x40, %ah + jnz L(ExitCase2_15) + sub $16, %edx + jb L(return_null) + lea 15(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_1): + mov %edi, %eax + RETURN + + .p2align 4 +L(ExitCase2_2): + sub $2, %edx + jb L(return_null) + lea 1(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_3): + sub $3, %edx + jb L(return_null) + lea 2(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_5): + sub $5, %edx + jb L(return_null) + lea 4(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_6): + sub $6, %edx + jb L(return_null) + lea 5(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_7): + sub $7, %edx + jb L(return_null) + lea 6(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_9): + sub $9, %edx + jb L(return_null) + lea 8(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_10): + sub $10, %edx + jb L(return_null) + lea 9(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_11): + sub $11, %edx + jb L(return_null) + lea 10(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_13): + sub $13, %edx + jb L(return_null) + lea 12(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_14): + sub $14, %edx + jb L(return_null) + lea 13(%edi), %eax + RETURN + + .p2align 4 +L(ExitCase2_15): + sub $15, %edx + jb L(return_null) + lea 14(%edi), %eax + RETURN +# endif + + .p2align 4 +L(return_null): + xor %eax, %eax +# ifndef USE_AS_RAWMEMCHR + RETURN +# else + ret +# endif + +END (MEMCHR) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S new file mode 100644 index 0000000000..bd0dace290 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memchr.S @@ -0,0 +1,65 @@ +/* Multiple versions of memchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
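+
+   Dispatch overview, as a rough C model of the resolver below (the
+   feature-test names are hypothetical; the real checks use
+   HAS_CPU_FEATURE and HAS_ARCH_FEATURE):
+
+     #include <stddef.h>
+
+     extern void *__memchr_ia32 (const void *, int, size_t);
+     extern void *__memchr_sse2 (const void *, int, size_t);
+     extern void *__memchr_sse2_bsf (const void *, int, size_t);
+
+     typedef void *(*memchr_fn) (const void *, int, size_t);
+
+     static memchr_fn
+     memchr_resolver (void)
+     {
+       if (!has_sse2 ())
+         return __memchr_ia32;
+       return has_slow_bsf () ? __memchr_sse2 : __memchr_sse2_bsf;
+     }
+
+     void *memchr (const void *, int, size_t)
+          __attribute__ ((ifunc ("memchr_resolver")));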
*/ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__memchr) + .type __memchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + HAS_CPU_FEATURE (SSE2) + jz 2f + HAS_ARCH_FEATURE (Slow_BSF) + jz 3f + + LOAD_FUNC_GOT_EAX (__memchr_sse2) + ret + +2: LOAD_FUNC_GOT_EAX (__memchr_ia32) + ret + +3: LOAD_FUNC_GOT_EAX (__memchr_sse2_bsf) + ret +END(__memchr) + +weak_alias(__memchr, memchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __memchr_ia32, @function; \ + .globl __memchr_ia32; \ + .p2align 4; \ + __memchr_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memchr_ia32, .-__memchr_ia32 + +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in a shared library, since + they will be called without setting up EBX, which is needed for the PLT + that IFUNC uses. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memchr; __GI_memchr = __memchr_ia32 + +#endif +#include "../../memchr.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S new file mode 100644 index 0000000000..2aa13048b2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-sse4.S @@ -0,0 +1,1225 @@ +/* memcmp with SSE4.2, wmemcmp with SSE4.2 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_sse4_2 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1 + 4 +# define LEN BLK2 + 4 +# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx) + + +# ifdef SHARED +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. INDEX is a register that contains + the index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +/* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ +/* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ +/* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx,INDEX,SCALE), %ebx; \ +/* We loaded the jump table and adjusted EDX/ESI. Go. */ \ + jmp *%ebx +# else +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute addresses. INDEX is a register that contains the index + into the jump table. SCALE is the scale of INDEX. 
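+
+   In this non-PIC case the table entries are the label addresses
+   themselves, so one indirect jump through the table suffices.  The
+   SHARED variant above instead stores label-minus-table offsets and
+   adds the table's run-time address back in, keeping the table free
+   of dynamic relocations.  A C model of the two layouts, with
+   made-up names (illustration only):
+
+     extern char table_base[];
+     extern const int rel_entries[];
+     extern void *const abs_entries[];
+
+     static void *
+     target_shared (int n)
+     {
+       return table_base + rel_entries[n];
+     }
+
+     static void *
+     target_static (int n)
+     {
+       return abs_entries[n];
+     }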
*/ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + .section .text.sse4.2,"ax",@progbits +ENTRY (MEMCMP) + movl BLK1(%esp), %eax + movl BLK2(%esp), %edx + movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(return0) +# else + cmp $1, %ecx + jbe L(less1bytes) +# endif + + pxor %xmm0, %xmm0 + cmp $64, %ecx + ja L(64bytesormore) + cmp $8, %ecx + +# ifndef USE_AS_WMEMCMP + PUSH (%ebx) + jb L(less8bytes) +# else + jb L(less8bytes) + PUSH (%ebx) +# endif + + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less8bytes): + mov (%eax), %bl + cmpb (%edx), %bl + jne L(nonzero) + + mov 1(%eax), %bl + cmpb 1(%edx), %bl + jne L(nonzero) + + cmp $2, %ecx + jz L(0bytes) + + mov 2(%eax), %bl + cmpb 2(%edx), %bl + jne L(nonzero) + + cmp $3, %ecx + jz L(0bytes) + + mov 3(%eax), %bl + cmpb 3(%edx), %bl + jne L(nonzero) + + cmp $4, %ecx + jz L(0bytes) + + mov 4(%eax), %bl + cmpb 4(%edx), %bl + jne L(nonzero) + + cmp $5, %ecx + jz L(0bytes) + + mov 5(%eax), %bl + cmpb 5(%edx), %bl + jne L(nonzero) + + cmp $6, %ecx + jz L(0bytes) + + mov 6(%eax), %bl + cmpb 6(%edx), %bl + je L(0bytes) + +L(nonzero): + POP (%ebx) + mov $1, %eax + ja L(above) + neg %eax +L(above): + ret + CFI_PUSH (%ebx) +# endif + + .p2align 4 +L(0bytes): + POP (%ebx) + xor %eax, %eax + ret + +# ifdef USE_AS_WMEMCMP + +/* for wmemcmp, case N == 1 */ + + .p2align 4 +L(less8bytes): + mov (%eax), %ecx + cmp (%edx), %ecx + je L(return0) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret + + .p2align 4 +L(return0): + xor %eax, %eax + ret +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less1bytes): + jb L(0bytesend) + movzbl (%eax), %eax + movzbl (%edx), %edx + sub %edx, %eax + ret + + .p2align 4 +L(0bytesend): + xor %eax, %eax + ret +# endif + .p2align 4 +L(64bytesormore): + PUSH (%ebx) + mov %ecx, %ebx + mov $64, %ecx + sub $64, %ebx +L(64bytesormore_loop): + movdqu (%eax), %xmm1 + movdqu (%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_16diff) + + movdqu 16(%eax), %xmm1 + movdqu 16(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_32diff) + + movdqu 32(%eax), %xmm1 + movdqu 32(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_48diff) + + movdqu 48(%eax), %xmm1 + movdqu 48(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(find_64diff) + add %ecx, %eax + add %ecx, %edx + sub %ecx, %ebx + jae L(64bytesormore_loop) + add %ebx, %ecx + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %ecx, 4) + +# ifdef USE_AS_WMEMCMP + +/* Label needs only for table_64bytes filling */ +L(unreal_case): +/* no code here */ + +# endif + .p2align 4 +L(find_16diff): + sub $16, %ecx +L(find_32diff): + sub $16, %ecx +L(find_48diff): + sub $16, %ecx +L(find_64diff): + add %ecx, %edx + add %ecx, %eax + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(16bytes): + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# else + .p2align 4 +L(16bytes): + 
mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + cmp -4(%edx), %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# endif + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(49bytes): + movdqu -49(%eax), %xmm1 + movdqu -49(%edx), %xmm2 + mov $-49, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(33bytes): + movdqu -33(%eax), %xmm1 + movdqu -33(%edx), %xmm2 + mov $-33, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(17bytes): + mov -17(%eax), %ecx + mov -17(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(13bytes): + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(9bytes): + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(5bytes): + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(50bytes): + mov $-50, %ebx + movdqu -50(%eax), %xmm1 + movdqu -50(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + mov $-34, %ebx + movdqu -34(%eax), %xmm1 + movdqu -34(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%eax), %ecx + mov -18(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(14bytes): + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(10bytes): + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(6bytes): + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(2bytes): + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(51bytes): + mov $-51, %ebx + movdqu -51(%eax), %xmm1 + movdqu -51(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(35bytes): + mov $-35, %ebx + movdqu -35(%eax), %xmm1 + movdqu -35(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(19bytes): + movl -19(%eax), %ecx + movl -19(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(15bytes): + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(11bytes): + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(7bytes): + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(3bytes): + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) +L(1bytes): + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + .p2align 4 +L(52bytes): + movdqu -52(%eax), %xmm1 + movdqu -52(%edx), %xmm2 + mov $-52, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%eax), %xmm1 + movdqu -36(%edx), %xmm2 + mov $-36, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%eax), %xmm1 + movdqu -20(%edx), %xmm2 + mov $-20, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(53bytes): + movdqu -53(%eax), %xmm1 + movdqu -53(%edx), %xmm2 + mov $-53, %ebx + pxor %xmm1, %xmm2 
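+/* pxor leaves nonzero bytes exactly where the two 16-byte blocks
+   differ; ptest against the all-zero %xmm0 then sets the carry flag
+   only when that result is zero, so the jnc branches in this file
+   mean "a difference was found".  In intrinsics terms (sketch only):
+
+     int equal = _mm_testc_si128 (_mm_setzero_si128 (),
+                                  _mm_xor_si128 (a, b));
+
+   Here equal is 1 exactly when the blocks compare equal, mirroring
+   the carry flag that ptest computes.  */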
+ ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(37bytes): + mov $-37, %ebx + movdqu -37(%eax), %xmm1 + movdqu -37(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(21bytes): + mov $-21, %ebx + movdqu -21(%eax), %xmm1 + movdqu -21(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(54bytes): + movdqu -54(%eax), %xmm1 + movdqu -54(%edx), %xmm2 + mov $-54, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + mov $-38, %ebx + movdqu -38(%eax), %xmm1 + movdqu -38(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + mov $-22, %ebx + movdqu -22(%eax), %xmm1 + movdqu -22(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(55bytes): + movdqu -55(%eax), %xmm1 + movdqu -55(%edx), %xmm2 + mov $-55, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(39bytes): + mov $-39, %ebx + movdqu -39(%eax), %xmm1 + movdqu -39(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(23bytes): + mov $-23, %ebx + movdqu -23(%eax), %xmm1 + movdqu -23(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + .p2align 4 +L(56bytes): + movdqu -56(%eax), %xmm1 + movdqu -56(%edx), %xmm2 + mov $-56, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + mov $-40, %ebx + movdqu -40(%eax), %xmm1 + movdqu -40(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + mov $-24, %ebx + movdqu -24(%eax), %xmm1 + movdqu -24(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif + jne L(find_diff) + + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(57bytes): + movdqu -57(%eax), %xmm1 + movdqu -57(%edx), %xmm2 + mov $-57, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(41bytes): + mov $-41, %ebx + movdqu -41(%eax), %xmm1 + movdqu -41(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(25bytes): + mov $-25, %ebx + movdqu -25(%eax), %xmm1 + movdqu -25(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(58bytes): + movdqu -58(%eax), %xmm1 + movdqu -58(%edx), %xmm2 + mov $-58, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + mov $-42, %ebx + movdqu -42(%eax), %xmm1 + movdqu 
-42(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + mov $-26, %ebx + movdqu -26(%eax), %xmm1 + movdqu -26(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(59bytes): + movdqu -59(%eax), %xmm1 + movdqu -59(%edx), %xmm2 + mov $-59, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(43bytes): + mov $-43, %ebx + movdqu -43(%eax), %xmm1 + movdqu -43(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(27bytes): + mov $-27, %ebx + movdqu -27(%eax), %xmm1 + movdqu -27(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + .p2align 4 +L(60bytes): + movdqu -60(%eax), %xmm1 + movdqu -60(%edx), %xmm2 + mov $-60, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + mov $-44, %ebx + movdqu -44(%eax), %xmm1 + movdqu -44(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + mov $-28, %ebx + movdqu -28(%eax), %xmm1 + movdqu -28(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -12(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif + jne L(find_diff) + + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif + jne L(find_diff) + + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(61bytes): + movdqu -61(%eax), %xmm1 + movdqu -61(%edx), %xmm2 + mov $-61, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(45bytes): + mov $-45, %ebx + movdqu -45(%eax), %xmm1 + movdqu -45(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(29bytes): + mov $-29, %ebx + movdqu -29(%eax), %xmm1 + movdqu -29(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(62bytes): + movdqu -62(%eax), %xmm1 + movdqu -62(%edx), %xmm2 + mov $-62, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + mov $-46, %ebx + movdqu -46(%eax), %xmm1 + movdqu -46(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + mov $-30, %ebx + movdqu -30(%eax), %xmm1 + movdqu -30(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov 
-10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + RETURN + + .p2align 4 +L(63bytes): + movdqu -63(%eax), %xmm1 + movdqu -63(%edx), %xmm2 + mov $-63, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(47bytes): + mov $-47, %ebx + movdqu -47(%eax), %xmm1 + movdqu -47(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(31bytes): + mov $-31, %ebx + movdqu -31(%eax), %xmm1 + movdqu -31(%edx), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + RETURN +# endif + + .p2align 4 +L(64bytes): + movdqu -64(%eax), %xmm1 + movdqu -64(%edx), %xmm2 + mov $-64, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%eax), %xmm1 + movdqu -48(%edx), %xmm2 + mov $-48, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%eax), %xmm1 + movdqu -32(%edx), %xmm2 + mov $-32, %ebx + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -16(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -16(%edx), %ecx +# endif + jne L(find_diff) + + mov -12(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -12(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -12(%edx), %ecx +# endif + jne L(find_diff) + + mov -8(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -8(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -8(%edx), %ecx +# endif + jne L(find_diff) + + mov -4(%eax), %ecx +# ifndef USE_AS_WMEMCMP + mov -4(%edx), %ebx + cmp %ebx, %ecx +# else + cmp -4(%edx), %ecx +# endif + mov $0, %eax + jne L(find_diff) + RETURN + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + mov (%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + mov 4(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + mov 8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + mov 12(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + RETURN +# else + .p2align 4 +L(less16bytes): + add %ebx, %eax + add %ebx, %edx + + mov (%eax), %ecx + cmp (%edx), %ecx + jne L(find_diff) + + mov 4(%eax), %ecx + cmp 4(%edx), %ecx + jne L(find_diff) + + mov 8(%eax), %ecx + cmp 8(%edx), %ecx + jne L(find_diff) + + mov 12(%eax), %ecx + cmp 12(%edx), %ecx + + mov $0, %eax + jne L(find_diff) + RETURN +# endif + + .p2align 4 +L(find_diff): +# ifndef USE_AS_WMEMCMP + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + shr $16,%ecx + shr $16,%ebx + cmp %bl, %cl + jne L(end) + cmp %bx, %cx +L(end): + POP (%ebx) + mov $1, %eax + ja L(bigger) + neg %eax +L(bigger): + ret +# else + POP (%ebx) + mov $1, %eax + jg L(bigger) + neg %eax + ret + + .p2align 4 +L(bigger): + ret +# endif +END (MEMCMP) + + .section .rodata.sse4.2,"a",@progbits + .p2align 2 + .type L(table_64bytes), @object +# ifndef USE_AS_WMEMCMP +L(table_64bytes): + .int JMPTBL (L(0bytes), 
L(table_64bytes)) + .int JMPTBL (L(1bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(3bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(5bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(7bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(9bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(11bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(13bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(15bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(17bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(19bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(21bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(23bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(25bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(27bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(29bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(31bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(33bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(35bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(37bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(39bytes), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(41bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(43bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(45bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(47bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(49bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(51bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(53bytes), L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(55bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(57bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(59bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(61bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(63bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +# else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL 
(L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S new file mode 100644 index 0000000000..5ebf5a4d73 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp-ssse3.S @@ -0,0 +1,2157 @@ +/* memcmp with SSSE3, wmemcmp with SSSE3 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef MEMCMP +# define MEMCMP __memcmp_ssse3 +# endif + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define BLK1 PARMS +# define BLK2 BLK1+4 +# define LEN BLK2+4 +# define RETURN_END POP (%edi); POP (%esi); POP (%ebx); ret +# define RETURN RETURN_END; cfi_restore_state; cfi_remember_state + +/* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +*/ + + atom_text_section +ENTRY (MEMCMP) + movl LEN(%esp), %ecx + +# ifdef USE_AS_WMEMCMP + shl $2, %ecx + test %ecx, %ecx + jz L(zero) +# endif + + movl BLK1(%esp), %eax + cmp $48, %ecx + movl BLK2(%esp), %edx + jae L(48bytesormore) + +# ifndef USE_AS_WMEMCMP + cmp $1, %ecx + jbe L(less1bytes) +# endif + + PUSH (%ebx) + add %ecx, %edx + add %ecx, %eax + jmp L(less48bytes) + + CFI_POP (%ebx) + +# ifndef USE_AS_WMEMCMP + .p2align 4 +L(less1bytes): + jb L(zero) + movb (%eax), %cl + cmp (%edx), %cl + je L(zero) + mov $1, %eax + ja L(1bytesend) + neg %eax +L(1bytesend): + ret +# endif + + .p2align 4 +L(zero): + xor %eax, %eax + ret + + .p2align 4 +L(48bytesormore): + PUSH (%ebx) + PUSH (%esi) + PUSH (%edi) + cfi_remember_state + movdqu (%eax), %xmm3 + movdqu (%edx), %xmm0 + movl %eax, %edi + movl %edx, %esi + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + lea 16(%edi), %edi + + sub $0xffff, %edx + lea 16(%esi), %esi + jnz L(less16bytes) + mov %edi, %edx + and $0xf, %edx + xor %edx, %edi + sub %edx, %esi + add %edx, %ecx + mov %esi, %edx + and $0xf, %edx + jz L(shr_0) + xor %edx, %esi + +# ifndef USE_AS_WMEMCMP + cmp $8, %edx + jae L(next_unaligned_table) + cmp $0, %edx + je L(shr_0) + cmp $1, %edx + je L(shr_1) + cmp $2, %edx + je L(shr_2) + cmp $3, %edx + je L(shr_3) + cmp $4, %edx + je L(shr_4) + cmp $5, %edx + je L(shr_5) + cmp $6, %edx + je L(shr_6) + jmp L(shr_7) + + .p2align 2 +L(next_unaligned_table): + cmp $8, %edx + je L(shr_8) + cmp $9, %edx + je L(shr_9) + cmp $10, %edx + je L(shr_10) + cmp $11, %edx + je L(shr_11) + cmp $12, %edx + je L(shr_12) + cmp $13, %edx + je L(shr_13) + cmp $14, %edx + je L(shr_14) + jmp L(shr_15) +# else + cmp $0, %edx + je L(shr_0) + cmp $4, %edx + je L(shr_4) + cmp $8, %edx + je L(shr_8) + jmp L(shr_12) +# endif + + .p2align 4 +L(shr_0): + cmp $80, %ecx + jae L(shr_0_gobble) + lea -48(%ecx), %ecx + xor %eax, %eax + movaps (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + movaps 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 + pand %xmm1, %xmm2 + pmovmskb %xmm2, %edx + add $32, %edi + add $32, %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_0_gobble): + lea -48(%ecx), %ecx + movdqa (%esi), %xmm0 + xor %eax, %eax + pcmpeqb (%edi), %xmm0 + sub $32, %ecx + movdqa 16(%esi), %xmm2 + pcmpeqb 16(%edi), %xmm2 +L(shr_0_gobble_loop): + pand %xmm0, %xmm2 + sub $32, %ecx + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + movdqa 32(%esi), %xmm0 + movdqa 48(%esi), %xmm2 + sbb $0xffff, %edx + pcmpeqb 32(%edi), %xmm0 + pcmpeqb 48(%edi), %xmm2 + lea 32(%edi), %edi + lea 32(%esi), %esi + jz 
L(shr_0_gobble_loop) + + pand %xmm0, %xmm2 + cmp $0, %ecx + jge L(shr_0_gobble_loop_next) + inc %edx + add $32, %ecx +L(shr_0_gobble_loop_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm2, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea (%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_1): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_1_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $1,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $1,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 1(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_1_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $1,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $1,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_1_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $1,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $1,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_1_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_1_gobble_next) + inc %edx + add $32, %ecx +L(shr_1_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 1(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_2): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_2_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $2,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $2,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_2_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $2,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $2,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_2_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $2,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $2,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_2_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_2_gobble_next) + inc %edx + add $32, %ecx +L(shr_2_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 2(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_3): + cmp 
$80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_3_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $3,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $3,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 3(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_3_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $3,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $3,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_3_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $3,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $3,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_3_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_3_gobble_next) + inc %edx + add $32, %ecx +L(shr_3_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 3(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_4): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_4_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $4,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $4,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_4_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $4,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $4,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_4_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $4,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $4,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_4_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_4_gobble_next) + inc %edx + add $32, %ecx +L(shr_4_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 4(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_5): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_5_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $5,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $5,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 5(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp 
L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_5_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $5,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $5,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_5_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $5,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $5,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_5_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_5_gobble_next) + inc %edx + add $32, %ecx +L(shr_5_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 5(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_6): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_6_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $6,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $6,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_6_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $6,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $6,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_6_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $6,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $6,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_6_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_6_gobble_next) + inc %edx + add $32, %ecx +L(shr_6_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 6(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_7): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_7_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $7,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $7,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 7(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_7_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $7,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $7,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_7_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $7,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $7,32(%esi), %xmm0 + 
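+	/* Editorial note: the gobble loops are software-pipelined: the
+	   sbb above tests the equality masks of the previous 32 bytes
+	   while the palignr/pcmpeqb around it are already realigning
+	   and comparing the next 32 source bytes.  */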
pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_7_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_7_gobble_next) + inc %edx + add $32, %ecx +L(shr_7_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 7(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_8): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_8_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $8,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $8,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_8_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $8,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $8,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_8_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $8,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $8,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_8_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_8_gobble_next) + inc %edx + add $32, %ecx +L(shr_8_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 8(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_9): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_9_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $9,(%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $9,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 9(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_9_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $9,(%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $9,16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_9_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $9,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $9,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_9_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_9_gobble_next) + inc %edx + add $32, %ecx +L(shr_9_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 9(%ecx, %esi,1), %edx + POP (%edi) + POP 
(%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_10): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_10_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $10, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $10,%xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_10_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $10, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $10, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_10_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $10,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $10,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_10_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_10_gobble_next) + inc %edx + add $32, %ecx +L(shr_10_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 10(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_11): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_11_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $11, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $11, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 11(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_11_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $11, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $11, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_11_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $11,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $11,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_11_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_11_gobble_next) + inc %edx + add $32, %ecx +L(shr_11_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 11(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_12): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_12_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $12, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $12, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, 
%edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_12_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $12, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $12, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_12_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $12,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $12,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_12_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_12_gobble_next) + inc %edx + add $32, %ecx +L(shr_12_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 12(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + +# ifndef USE_AS_WMEMCMP + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_13): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_13_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $13, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $13, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 13(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_13_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $13, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $13, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_13_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $13,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $13,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_13_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_13_gobble_next) + inc %edx + add $32, %ecx +L(shr_13_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 13(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_14): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_14_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $14, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $14, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_14_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $14, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $14, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_14_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + 
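+	/* Editorial note: none of the SSE instructions between the
+	   sub $32, %ecx above and the sbb $0xffff, %edx below writes
+	   EFLAGS, so the counter's borrow is folded into the mask test:
+	   a single jz re-enters the loop only while the combined compare
+	   mask was all-ones and the count has not underflowed.  The
+	   inc %edx/add $32, %ecx after the loop undo the borrow when the
+	   exit was caused by the counter rather than by a mismatch.  */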
pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $14,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $14,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_14_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_14_gobble_next) + inc %edx + add $32, %ecx +L(shr_14_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 14(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_15): + cmp $80, %ecx + lea -48(%ecx), %ecx + mov %edx, %eax + jae L(shr_15_gobble) + + movdqa 16(%esi), %xmm1 + movdqa %xmm1, %xmm2 + palignr $15, (%esi), %xmm1 + pcmpeqb (%edi), %xmm1 + + movdqa 32(%esi), %xmm3 + palignr $15, %xmm2, %xmm3 + pcmpeqb 16(%edi), %xmm3 + + pand %xmm1, %xmm3 + pmovmskb %xmm3, %edx + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + lea (%ecx, %edi,1), %eax + lea 15(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(shr_15_gobble): + sub $32, %ecx + movdqa 16(%esi), %xmm0 + palignr $15, (%esi), %xmm0 + pcmpeqb (%edi), %xmm0 + + movdqa 32(%esi), %xmm3 + palignr $15, 16(%esi), %xmm3 + pcmpeqb 16(%edi), %xmm3 + +L(shr_15_gobble_loop): + pand %xmm0, %xmm3 + sub $32, %ecx + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + + movdqa 64(%esi), %xmm3 + palignr $15,48(%esi), %xmm3 + sbb $0xffff, %edx + movdqa 48(%esi), %xmm0 + palignr $15,32(%esi), %xmm0 + pcmpeqb 32(%edi), %xmm0 + lea 32(%esi), %esi + pcmpeqb 48(%edi), %xmm3 + + lea 32(%edi), %edi + jz L(shr_15_gobble_loop) + pand %xmm0, %xmm3 + + cmp $0, %ecx + jge L(shr_15_gobble_next) + inc %edx + add $32, %ecx +L(shr_15_gobble_next): + test %edx, %edx + jnz L(exit) + + pmovmskb %xmm3, %edx + movdqa %xmm0, %xmm1 + lea 32(%edi), %edi + lea 32(%esi), %esi + sub $0xffff, %edx + jnz L(exit) + + lea (%ecx, %edi,1), %eax + lea 15(%ecx, %esi,1), %edx + POP (%edi) + POP (%esi) + jmp L(less48bytes) +# endif + + cfi_restore_state + cfi_remember_state + .p2align 4 +L(exit): + pmovmskb %xmm1, %ebx + sub $0xffff, %ebx + jz L(first16bytes) + lea -16(%esi), %esi + lea -16(%edi), %edi + mov %ebx, %edx + +L(first16bytes): + add %eax, %esi +L(less16bytes): + +# ifndef USE_AS_WMEMCMP + test %dl, %dl + jz L(next_24_bytes) + + test $0x01, %dl + jnz L(Byte16) + + test $0x02, %dl + jnz L(Byte17) + + test $0x04, %dl + jnz L(Byte18) + + test $0x08, %dl + jnz L(Byte19) + + test $0x10, %dl + jnz L(Byte20) + + test $0x20, %dl + jnz L(Byte21) + + test $0x40, %dl + jnz L(Byte22) +L(Byte23): + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte16): + movzbl -16(%edi), %eax + movzbl -16(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte17): + movzbl -15(%edi), %eax + movzbl -15(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte18): + movzbl -14(%edi), %eax + movzbl -14(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte19): + movzbl -13(%edi), %eax + movzbl -13(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte20): + movzbl -12(%edi), %eax + movzbl -12(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte21): + movzbl -11(%edi), %eax + movzbl -11(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(Byte22): + movzbl -10(%edi), %eax + movzbl 
-10(%esi), %edx + sub %edx, %eax + RETURN + + .p2align 4 +L(next_24_bytes): + lea 8(%edi), %edi + lea 8(%esi), %esi + test $0x01, %dh + jnz L(Byte16) + + test $0x02, %dh + jnz L(Byte17) + + test $0x04, %dh + jnz L(Byte18) + + test $0x08, %dh + jnz L(Byte19) + + test $0x10, %dh + jnz L(Byte20) + + test $0x20, %dh + jnz L(Byte21) + + test $0x40, %dh + jnz L(Byte22) + + .p2align 4 +L(Byte31): + movzbl -9(%edi), %eax + movzbl -9(%esi), %edx + sub %edx, %eax + RETURN_END +# else + +/* special for wmemcmp */ + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov -16(%edi), %eax + cmp -16(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word): + mov -12(%edi), %eax + cmp -12(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov -8(%edi), %eax + cmp -8(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word): + mov -4(%edi), %eax + cmp -4(%esi), %eax + jne L(nequal) + RETURN + + .p2align 4 +L(nequal): + mov $1, %eax + jg L(nequal_bigger) + neg %eax + RETURN + + .p2align 4 +L(nequal_bigger): + RETURN_END +# endif + + CFI_PUSH (%ebx) + + .p2align 4 +L(more8bytes): + cmp $16, %ecx + jae L(more16bytes) + cmp $8, %ecx + je L(8bytes) +# ifndef USE_AS_WMEMCMP + cmp $9, %ecx + je L(9bytes) + cmp $10, %ecx + je L(10bytes) + cmp $11, %ecx + je L(11bytes) + cmp $12, %ecx + je L(12bytes) + cmp $13, %ecx + je L(13bytes) + cmp $14, %ecx + je L(14bytes) + jmp L(15bytes) +# else + jmp L(12bytes) +# endif + + .p2align 4 +L(more16bytes): + cmp $24, %ecx + jae L(more24bytes) + cmp $16, %ecx + je L(16bytes) +# ifndef USE_AS_WMEMCMP + cmp $17, %ecx + je L(17bytes) + cmp $18, %ecx + je L(18bytes) + cmp $19, %ecx + je L(19bytes) + cmp $20, %ecx + je L(20bytes) + cmp $21, %ecx + je L(21bytes) + cmp $22, %ecx + je L(22bytes) + jmp L(23bytes) +# else + jmp L(20bytes) +# endif + + .p2align 4 +L(more24bytes): + cmp $32, %ecx + jae L(more32bytes) + cmp $24, %ecx + je L(24bytes) +# ifndef USE_AS_WMEMCMP + cmp $25, %ecx + je L(25bytes) + cmp $26, %ecx + je L(26bytes) + cmp $27, %ecx + je L(27bytes) + cmp $28, %ecx + je L(28bytes) + cmp $29, %ecx + je L(29bytes) + cmp $30, %ecx + je L(30bytes) + jmp L(31bytes) +# else + jmp L(28bytes) +# endif + + .p2align 4 +L(more32bytes): + cmp $40, %ecx + jae L(more40bytes) + cmp $32, %ecx + je L(32bytes) +# ifndef USE_AS_WMEMCMP + cmp $33, %ecx + je L(33bytes) + cmp $34, %ecx + je L(34bytes) + cmp $35, %ecx + je L(35bytes) + cmp $36, %ecx + je L(36bytes) + cmp $37, %ecx + je L(37bytes) + cmp $38, %ecx + je L(38bytes) + jmp L(39bytes) +# else + jmp L(36bytes) +# endif + + .p2align 4 +L(less48bytes): + cmp $8, %ecx + jae L(more8bytes) +# ifndef USE_AS_WMEMCMP + cmp $2, %ecx + je L(2bytes) + cmp $3, %ecx + je L(3bytes) + cmp $4, %ecx + je L(4bytes) + cmp $5, %ecx + je L(5bytes) + cmp $6, %ecx + je L(6bytes) + jmp L(7bytes) +# else + jmp L(4bytes) +# endif + + .p2align 4 +L(more40bytes): + cmp $40, %ecx + je L(40bytes) +# ifndef USE_AS_WMEMCMP + cmp $41, %ecx + je L(41bytes) + cmp $42, %ecx + je L(42bytes) + cmp $43, %ecx + je L(43bytes) + cmp $44, %ecx + je L(44bytes) + cmp $45, %ecx + je L(45bytes) + cmp $46, %ecx + je L(46bytes) + jmp L(47bytes) + + .p2align 4 +L(44bytes): + mov -44(%eax), %ecx + mov -44(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + mov -40(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + mov -36(%edx), %ebx + cmp %ebx, %ecx + jne 
L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + mov -32(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + mov -28(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + mov -24(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + mov -20(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + mov -16(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + mov -12(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + mov -8(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + mov -4(%edx), %ebx + cmp %ebx, %ecx + mov $0, %eax + jne L(find_diff) + POP (%ebx) + ret + CFI_PUSH (%ebx) +# else + .p2align 4 +L(44bytes): + mov -44(%eax), %ecx + cmp -44(%edx), %ecx + jne L(find_diff) +L(40bytes): + mov -40(%eax), %ecx + cmp -40(%edx), %ecx + jne L(find_diff) +L(36bytes): + mov -36(%eax), %ecx + cmp -36(%edx), %ecx + jne L(find_diff) +L(32bytes): + mov -32(%eax), %ecx + cmp -32(%edx), %ecx + jne L(find_diff) +L(28bytes): + mov -28(%eax), %ecx + cmp -28(%edx), %ecx + jne L(find_diff) +L(24bytes): + mov -24(%eax), %ecx + cmp -24(%edx), %ecx + jne L(find_diff) +L(20bytes): + mov -20(%eax), %ecx + cmp -20(%edx), %ecx + jne L(find_diff) +L(16bytes): + mov -16(%eax), %ecx + cmp -16(%edx), %ecx + jne L(find_diff) +L(12bytes): + mov -12(%eax), %ecx + cmp -12(%edx), %ecx + jne L(find_diff) +L(8bytes): + mov -8(%eax), %ecx + cmp -8(%edx), %ecx + jne L(find_diff) +L(4bytes): + mov -4(%eax), %ecx + xor %eax, %eax + cmp -4(%edx), %ecx + jne L(find_diff) + POP (%ebx) + ret + CFI_PUSH (%ebx) +# endif + +# ifndef USE_AS_WMEMCMP + + .p2align 4 +L(45bytes): + mov -45(%eax), %ecx + mov -45(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(41bytes): + mov -41(%eax), %ecx + mov -41(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(37bytes): + mov -37(%eax), %ecx + mov -37(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(33bytes): + mov -33(%eax), %ecx + mov -33(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(29bytes): + mov -29(%eax), %ecx + mov -29(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(25bytes): + mov -25(%eax), %ecx + mov -25(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(21bytes): + mov -21(%eax), %ecx + mov -21(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(17bytes): + mov -17(%eax), %ecx + mov -17(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(13bytes): + mov -13(%eax), %ecx + mov -13(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(9bytes): + mov -9(%eax), %ecx + mov -9(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(5bytes): + mov -5(%eax), %ecx + mov -5(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) + movzbl -1(%eax), %ecx + cmp -1(%edx), %cl + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 +L(46bytes): + mov -46(%eax), %ecx + mov -46(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(42bytes): + mov -42(%eax), %ecx + mov -42(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(38bytes): + mov -38(%eax), %ecx + mov -38(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(34bytes): + mov -34(%eax), %ecx + mov -34(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(30bytes): + mov -30(%eax), %ecx + mov -30(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(26bytes): + mov -26(%eax), %ecx + mov -26(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(22bytes): + mov -22(%eax), %ecx + mov -22(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) 
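+/* Editorial note: each L(Nbytes) entry in these chains compares one
+   trailing 4-byte chunk and falls through to the next-smaller entry,
+   so a tail of N bytes costs about N/4 dword compares; the first
+   mismatching pair of dwords is handed to L(find_diff).  A rough C
+   model of what L(find_diff) computes from that pair (illustrative
+   sketch only; find_diff32 is a made-up name, not part of this file):
+
+   static int
+   find_diff32 (unsigned int c, unsigned int b)
+   {
+     // c, b: little-endian dwords from the two buffers; scan from the
+     // lowest-addressed byte, as memcmp semantics require.
+     for (int i = 0; i < 4; i++, c >>= 8, b >>= 8)
+       if ((c & 0xff) != (b & 0xff))
+         return (c & 0xff) > (b & 0xff) ? 1 : -1;
+     return 0;
+   }
+
+   The assembly avoids the loop: it compares the low bytes, then the
+   low words, shifts both dwords right by 16 and repeats, so the first
+   unequal compare leaves the flags for the final ja/neg sequence.  */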
+L(18bytes): + mov -18(%eax), %ecx + mov -18(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(14bytes): + mov -14(%eax), %ecx + mov -14(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(10bytes): + mov -10(%eax), %ecx + mov -10(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(6bytes): + mov -6(%eax), %ecx + mov -6(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(2bytes): + movzwl -2(%eax), %ecx + movzwl -2(%edx), %ebx + cmp %bl, %cl + jne L(end) + cmp %bh, %ch + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 +L(47bytes): + movl -47(%eax), %ecx + movl -47(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(43bytes): + movl -43(%eax), %ecx + movl -43(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(39bytes): + movl -39(%eax), %ecx + movl -39(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(35bytes): + movl -35(%eax), %ecx + movl -35(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(31bytes): + movl -31(%eax), %ecx + movl -31(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(27bytes): + movl -27(%eax), %ecx + movl -27(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(23bytes): + movl -23(%eax), %ecx + movl -23(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(19bytes): + movl -19(%eax), %ecx + movl -19(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(15bytes): + movl -15(%eax), %ecx + movl -15(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(11bytes): + movl -11(%eax), %ecx + movl -11(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(7bytes): + movl -7(%eax), %ecx + movl -7(%edx), %ebx + cmp %ebx, %ecx + jne L(find_diff) +L(3bytes): + movzwl -3(%eax), %ecx + movzwl -3(%edx), %ebx + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + movzbl -1(%eax), %eax + cmpb -1(%edx), %al + mov $0, %eax + jne L(end) + POP (%ebx) + ret + CFI_PUSH (%ebx) + + .p2align 4 +L(find_diff): + cmpb %bl, %cl + jne L(end) + cmp %bx, %cx + jne L(end) + shr $16,%ecx + shr $16,%ebx + cmp %bl, %cl + jne L(end) + cmp %bx, %cx + + .p2align 4 +L(end): + POP (%ebx) + mov $1, %eax + ja L(bigger) + neg %eax +L(bigger): + ret +# else + +/* for wmemcmp */ + .p2align 4 +L(find_diff): + POP (%ebx) + mov $1, %eax + jg L(find_diff_bigger) + neg %eax + ret + + .p2align 4 +L(find_diff_bigger): + ret + +# endif +END (MEMCMP) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S new file mode 100644 index 0000000000..1fc5994a17 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcmp.S @@ -0,0 +1,62 @@ +/* Multiple versions of memcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. 
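+   (Editorial note: the resolver below picks the most capable of the
+   three implementations when the symbol is resolved.  A C-style sketch
+   of the same selection, treating HAS_CPU_FEATURE loosely as a C
+   predicate and memcmp_ifunc as a made-up name:
+
+     __typeof (memcmp) *
+     memcmp_ifunc (void)
+     {
+       if (HAS_CPU_FEATURE (SSE4_2))
+         return __memcmp_sse4_2;
+       if (HAS_CPU_FEATURE (SSSE3))
+         return __memcmp_ssse3;
+       return __memcmp_ia32;
+     }
+
+   The assembly starts with __memcmp_ia32 in EAX and upgrades the
+   choice while each successive feature check succeeds.)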
*/
+#if IS_IN (libc)
+	.text
+ENTRY(memcmp)
+	.type	memcmp, @gnu_indirect_function
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcmp_ia32)
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcmp_ssse3)
+	HAS_CPU_FEATURE (SSE4_2)
+	jz	2f
+	LOAD_FUNC_GOT_EAX (__memcmp_sse4_2)
+2:	ret
+END(memcmp)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcmp_ia32, @function; \
+	.p2align 4; \
+	.globl __memcmp_ia32; \
+	.hidden __memcmp_ia32; \
+	__memcmp_ia32: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcmp_ia32, .-__memcmp_ia32
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* IFUNC does not work with hidden functions in a shared library: they
+   would be called without setting up EBX, which the PLT (and hence
+   IFUNC dispatch) requires.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memcmp; __GI_memcmp = __memcmp_ia32
+# endif
+#endif
+
+#include "../memcmp.S"
diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000000..2fe2072cb1
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,681 @@
+/* memcpy optimized with SSE2 unaligned memory access instructions.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+# define MEMCPY	__memcpy_sse2_unaligned
+# define MEMCPY_CHK	__memcpy_chk_sse2_unaligned
+# endif
+
+# ifdef USE_AS_BCOPY
+# define SRC	PARMS
+# define DEST	SRC+4
+# define LEN	DEST+4
+# else
+# define DEST	PARMS
+# define SRC	DEST+4
+# define LEN	SRC+4
+# endif
+
+# define CFI_PUSH(REG) \
+	cfi_adjust_cfa_offset (4); \
+	cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+	cfi_adjust_cfa_offset (-4); \
+	cfi_restore (REG)
+
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
+
+# define PARMS	8	/* Preserve EBX. */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN	RETURN_END; CFI_PUSH (%ebx)
+
+	.section .text.sse2,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+	cmp	%edx, %eax
+
+# ifdef USE_AS_MEMMOVE
+	jg	L(check_forward)
+
+L(mm_len_0_or_more_backward):
+/* Now do checks for lengths.  We do [0..16], [16..32], [32..64], [64..128]
+   separately.  */
+	cmp	$16, %ecx
+	jbe	L(mm_len_0_16_bytes_backward)
+
+	cmpl	$32, %ecx
+	jg	L(mm_len_32_or_more_backward)
+
+/* Copy [0..32] and return.
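+   (Lengths 17..32 land here: the 16-byte load of the head and the
+   16-byte load ending at the last byte overlap in the middle, so a
+   single branchless pair of unaligned loads and stores covers every
+   length in the range.)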
*/ + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_32_or_more_backward): + cmpl $64, %ecx + jg L(mm_len_64_or_more_backward) + +/* Copy [0..64] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu -16(%eax, %ecx), %xmm2 + movdqu -32(%eax, %ecx), %xmm3 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, -16(%edx, %ecx) + movdqu %xmm3, -32(%edx, %ecx) + jmp L(return) + +L(mm_len_64_or_more_backward): + cmpl $128, %ecx + jg L(mm_len_128_or_more_backward) + +/* Copy [0..128] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_128_or_more_backward): + add %ecx, %eax + cmp %edx, %eax + movl SRC(%esp), %eax + jle L(forward) + PUSH (%esi) + PUSH (%edi) + PUSH (%ebx) + +/* Aligning the address of destination. */ + movdqu (%eax), %xmm4 + movdqu 16(%eax), %xmm5 + movdqu 32(%eax), %xmm6 + movdqu 48(%eax), %xmm7 + leal (%edx, %ecx), %esi + movdqu -16(%eax, %ecx), %xmm0 + subl $16, %esp + movdqu %xmm0, (%esp) + mov %ecx, %edi + movl %esi, %ecx + andl $-16, %ecx + leal (%ecx), %ebx + subl %edx, %ebx + leal (%eax, %ebx), %eax + shrl $6, %ebx + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %edi +# else +# ifdef SHARED + PUSH (%ebx) + SETUP_PIC_REG (bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi + POP (%ebx) +# else + cmp __x86_shared_cache_size_half, %edi +# endif +# endif + jae L(mm_large_page_loop_backward) + + .p2align 4 +L(mm_main_loop_backward): + + prefetcht0 -128(%eax) + + movdqu -64(%eax), %xmm0 + movdqu -48(%eax), %xmm1 + movdqu -32(%eax), %xmm2 + movdqu -16(%eax), %xmm3 + movaps %xmm0, -64(%ecx) + subl $64, %eax + movaps %xmm1, -48(%ecx) + movaps %xmm2, -32(%ecx) + movaps %xmm3, -16(%ecx) + subl $64, %ecx + sub $1, %ebx + jnz L(mm_main_loop_backward) + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, -16(%esi) + movdqu %xmm4, (%edx) + movdqu %xmm5, 16(%edx) + movdqu %xmm6, 32(%edx) + movdqu %xmm7, 48(%edx) + POP (%ebx) + jmp L(mm_return_pop_all) + +/* Copy [0..16] and return. */ +L(mm_len_0_16_bytes_backward): + testb $24, %cl + jnz L(mm_len_9_16_bytes_backward) + testb $4, %cl + .p2align 4,,5 + jnz L(mm_len_5_8_bytes_backward) + testl %ecx, %ecx + .p2align 4,,2 + je L(return) + testb $2, %cl + .p2align 4,,1 + jne L(mm_len_3_4_bytes_backward) + movzbl -1(%eax,%ecx), %ebx + movzbl (%eax), %eax + movb %bl, -1(%edx,%ecx) + movb %al, (%edx) + jmp L(return) + +L(mm_len_3_4_bytes_backward): + movzwl -2(%eax,%ecx), %ebx + movzwl (%eax), %eax + movw %bx, -2(%edx,%ecx) + movw %ax, (%edx) + jmp L(return) + +L(mm_len_9_16_bytes_backward): + PUSH (%esi) + movl -4(%eax,%ecx), %ebx + movl -8(%eax,%ecx), %esi + movl %ebx, -4(%edx,%ecx) + movl %esi, -8(%edx,%ecx) + subl $8, %ecx + POP (%esi) + jmp L(mm_len_0_16_bytes_backward) + +L(mm_len_5_8_bytes_backward): + movl (%eax), %ebx + movl -4(%eax,%ecx), %eax + movl %ebx, (%edx) + movl %eax, -4(%edx,%ecx) + jmp L(return) + +/* Big length copy backward part. 
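+   (Reached when the copy length is at least half the shared cache
+   size: the movntdq stores below bypass the cache, and the sfence
+   after the loop orders them before the ordinary movdqu stores that
+   patch the head and tail.)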
*/ + .p2align 4 +L(mm_large_page_loop_backward): + movdqu -64(%eax), %xmm0 + movdqu -48(%eax), %xmm1 + movdqu -32(%eax), %xmm2 + movdqu -16(%eax), %xmm3 + movntdq %xmm0, -64(%ecx) + subl $64, %eax + movntdq %xmm1, -48(%ecx) + movntdq %xmm2, -32(%ecx) + movntdq %xmm3, -16(%ecx) + subl $64, %ecx + sub $1, %ebx + jnz L(mm_large_page_loop_backward) + sfence + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, -16(%esi) + movdqu %xmm4, (%edx) + movdqu %xmm5, 16(%edx) + movdqu %xmm6, 32(%edx) + movdqu %xmm7, 48(%edx) + POP (%ebx) + jmp L(mm_return_pop_all) + +L(check_forward): + add %edx, %ecx + cmp %eax, %ecx + movl LEN(%esp), %ecx + jle L(forward) + +/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128] + separately. */ + cmp $16, %ecx + jbe L(mm_len_0_16_bytes_forward) + + cmpl $32, %ecx + ja L(mm_len_32_or_more_forward) + +/* Copy [0..32] and return. */ + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_32_or_more_forward): + cmpl $64, %ecx + ja L(mm_len_64_or_more_forward) + +/* Copy [0..64] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu -16(%eax, %ecx), %xmm2 + movdqu -32(%eax, %ecx), %xmm3 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, -16(%edx, %ecx) + movdqu %xmm3, -32(%edx, %ecx) + jmp L(return) + +L(mm_len_64_or_more_forward): + cmpl $128, %ecx + ja L(mm_len_128_or_more_forward) + +/* Copy [0..128] and return. */ + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + jmp L(return) + +L(mm_len_128_or_more_forward): + PUSH (%esi) + PUSH (%edi) + PUSH (%ebx) + +/* Aligning the address of destination. 
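+   (The last 64 bytes are loaded into xmm4-xmm7 and the first 16 bytes
+   are parked on the stack before the destination is rounded up to a
+   16-byte boundary, so the main loop can use aligned stores and the
+   unaligned edges are patched afterwards with movdqu.)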
*/ + movdqu -16(%eax, %ecx), %xmm4 + movdqu -32(%eax, %ecx), %xmm5 + movdqu -48(%eax, %ecx), %xmm6 + movdqu -64(%eax, %ecx), %xmm7 + leal (%edx, %ecx), %esi + movdqu (%eax), %xmm0 + subl $16, %esp + movdqu %xmm0, (%esp) + mov %ecx, %edi + leal 16(%edx), %ecx + andl $-16, %ecx + movl %ecx, %ebx + subl %edx, %ebx + addl %ebx, %eax + movl %esi, %ebx + subl %ecx, %ebx + shrl $6, %ebx + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %edi +# else +# ifdef SHARED + PUSH (%ebx) + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %edi + POP (%ebx) +# else + cmp __x86_shared_cache_size_half, %edi +# endif +# endif + jae L(mm_large_page_loop_forward) + + .p2align 4 +L(mm_main_loop_forward): + + prefetcht0 128(%eax) + + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqa %xmm0, (%ecx) + addl $64, %eax + movaps %xmm1, 16(%ecx) + movaps %xmm2, 32(%ecx) + movaps %xmm3, 48(%ecx) + addl $64, %ecx + sub $1, %ebx + jnz L(mm_main_loop_forward) + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, (%edx) + movdqu %xmm4, -16(%esi) + movdqu %xmm5, -32(%esi) + movdqu %xmm6, -48(%esi) + movdqu %xmm7, -64(%esi) + POP (%ebx) + jmp L(mm_return_pop_all) + +L(mm_len_0_16_bytes_forward): + testb $24, %cl + jne L(mm_len_9_16_bytes_forward) + testb $4, %cl + .p2align 4,,5 + jne L(mm_len_5_8_bytes_forward) + testl %ecx, %ecx + .p2align 4,,2 + je L(return) + testb $2, %cl + .p2align 4,,1 + jne L(mm_len_2_4_bytes_forward) + movzbl -1(%eax,%ecx), %ebx + movzbl (%eax), %eax + movb %bl, -1(%edx,%ecx) + movb %al, (%edx) + jmp L(return) + +L(mm_len_2_4_bytes_forward): + movzwl -2(%eax,%ecx), %ebx + movzwl (%eax), %eax + movw %bx, -2(%edx,%ecx) + movw %ax, (%edx) + jmp L(return) + +L(mm_len_5_8_bytes_forward): + movl (%eax), %ebx + movl -4(%eax,%ecx), %eax + movl %ebx, (%edx) + movl %eax, -4(%edx,%ecx) + jmp L(return) + +L(mm_len_9_16_bytes_forward): + movq (%eax), %xmm0 + movq -8(%eax, %ecx), %xmm1 + movq %xmm0, (%edx) + movq %xmm1, -8(%edx, %ecx) + jmp L(return) + +L(mm_return_pop_all): + movl %edx, %eax + POP (%edi) + POP (%esi) + RETURN + +/* Big length copy forward part. 
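+   (The forward twin of the backward non-temporal loop above: the same
+   movntdq stores and trailing sfence, marching upward through the
+   buffer.)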
*/ + .p2align 4 +L(mm_large_page_loop_forward): + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movntdq %xmm0, (%ecx) + addl $64, %eax + movntdq %xmm1, 16(%ecx) + movntdq %xmm2, 32(%ecx) + movntdq %xmm3, 48(%ecx) + addl $64, %ecx + sub $1, %ebx + jnz L(mm_large_page_loop_forward) + sfence + movdqu (%esp), %xmm0 + addl $16, %esp + movdqu %xmm0, (%edx) + movdqu %xmm4, -16(%esi) + movdqu %xmm5, -32(%esi) + movdqu %xmm6, -48(%esi) + movdqu %xmm7, -64(%esi) + POP (%ebx) + jmp L(mm_return_pop_all) +# endif + +L(forward): + cmp $16, %ecx + jbe L(len_0_16_bytes) + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +# endif + jae L(large_page) + + movdqu (%eax), %xmm0 + movdqu -16(%eax, %ecx), %xmm1 + cmpl $32, %ecx + movdqu %xmm0, (%edx) + movdqu %xmm1, -16(%edx, %ecx) + jbe L(return) + + movdqu 16(%eax), %xmm0 + movdqu -32(%eax, %ecx), %xmm1 + cmpl $64, %ecx + movdqu %xmm0, 16(%edx) + movdqu %xmm1, -32(%edx, %ecx) + jbe L(return) + + movdqu 32(%eax), %xmm0 + movdqu 48(%eax), %xmm1 + movdqu -48(%eax, %ecx), %xmm2 + movdqu -64(%eax, %ecx), %xmm3 + cmpl $128, %ecx + movdqu %xmm0, 32(%edx) + movdqu %xmm1, 48(%edx) + movdqu %xmm2, -48(%edx, %ecx) + movdqu %xmm3, -64(%edx, %ecx) + jbe L(return) + +/* Now the main loop: we align the address of the destination. */ + leal 64(%edx), %ebx + andl $-64, %ebx + + addl %edx, %ecx + andl $-64, %ecx + + subl %edx, %eax + +/* We should stop two iterations before the termination + (in order not to misprefetch). */ + subl $64, %ecx + cmpl %ebx, %ecx + je L(main_loop_just_one_iteration) + + subl $64, %ecx + cmpl %ebx, %ecx + je L(main_loop_last_two_iterations) + + .p2align 4 +L(main_loop_cache): + + prefetcht0 128(%ebx, %eax) + + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqa %xmm0, (%ebx) + movaps %xmm1, 16(%ebx) + movaps %xmm2, 32(%ebx) + movaps %xmm3, 48(%ebx) + lea 64(%ebx), %ebx + cmpl %ebx, %ecx + jne L(main_loop_cache) + +L(main_loop_last_two_iterations): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqu 64(%ebx, %eax), %xmm4 + movdqu 80(%ebx, %eax), %xmm5 + movdqu 96(%ebx, %eax), %xmm6 + movdqu 112(%ebx, %eax), %xmm7 + movdqa %xmm0, (%ebx) + movaps %xmm1, 16(%ebx) + movaps %xmm2, 32(%ebx) + movaps %xmm3, 48(%ebx) + movaps %xmm4, 64(%ebx) + movaps %xmm5, 80(%ebx) + movaps %xmm6, 96(%ebx) + movaps %xmm7, 112(%ebx) + jmp L(return) + +L(main_loop_just_one_iteration): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqa %xmm0, (%ebx) + movaps %xmm1, 16(%ebx) + movaps %xmm2, 32(%ebx) + movaps %xmm3, 48(%ebx) + jmp L(return) + +L(large_page): + movdqu (%eax), %xmm0 + movdqu 16(%eax), %xmm1 + movdqu 32(%eax), %xmm2 + movdqu 48(%eax), %xmm3 + movdqu -64(%eax, %ecx), %xmm4 + movdqu -48(%eax, %ecx), %xmm5 + movdqu -32(%eax, %ecx), %xmm6 + movdqu -16(%eax, %ecx), %xmm7 + movdqu %xmm0, (%edx) + movdqu %xmm1, 16(%edx) + movdqu %xmm2, 32(%edx) + movdqu %xmm3, 48(%edx) + movdqu %xmm4, -64(%edx, %ecx) + movdqu %xmm5, -48(%edx, %ecx) + movdqu %xmm6, -32(%edx, %ecx) + movdqu %xmm7, -16(%edx, %ecx) + + movdqu 64(%eax), %xmm0 + movdqu 80(%eax), %xmm1 + movdqu 96(%eax), %xmm2 + movdqu 
112(%eax), %xmm3 + movdqu -128(%eax, %ecx), %xmm4 + movdqu -112(%eax, %ecx), %xmm5 + movdqu -96(%eax, %ecx), %xmm6 + movdqu -80(%eax, %ecx), %xmm7 + movdqu %xmm0, 64(%edx) + movdqu %xmm1, 80(%edx) + movdqu %xmm2, 96(%edx) + movdqu %xmm3, 112(%edx) + movdqu %xmm4, -128(%edx, %ecx) + movdqu %xmm5, -112(%edx, %ecx) + movdqu %xmm6, -96(%edx, %ecx) + movdqu %xmm7, -80(%edx, %ecx) + +/* Now the main loop with non temporal stores. We align + the address of the destination. */ + leal 128(%edx), %ebx + andl $-128, %ebx + + addl %edx, %ecx + andl $-128, %ecx + + subl %edx, %eax + + .p2align 4 +L(main_loop_large_page): + movdqu (%ebx, %eax), %xmm0 + movdqu 16(%ebx, %eax), %xmm1 + movdqu 32(%ebx, %eax), %xmm2 + movdqu 48(%ebx, %eax), %xmm3 + movdqu 64(%ebx, %eax), %xmm4 + movdqu 80(%ebx, %eax), %xmm5 + movdqu 96(%ebx, %eax), %xmm6 + movdqu 112(%ebx, %eax), %xmm7 + movntdq %xmm0, (%ebx) + movntdq %xmm1, 16(%ebx) + movntdq %xmm2, 32(%ebx) + movntdq %xmm3, 48(%ebx) + movntdq %xmm4, 64(%ebx) + movntdq %xmm5, 80(%ebx) + movntdq %xmm6, 96(%ebx) + movntdq %xmm7, 112(%ebx) + lea 128(%ebx), %ebx + cmpl %ebx, %ecx + jne L(main_loop_large_page) + sfence + jmp L(return) + +L(len_0_16_bytes): + testb $24, %cl + jne L(len_9_16_bytes) + testb $4, %cl + .p2align 4,,5 + jne L(len_5_8_bytes) + testl %ecx, %ecx + .p2align 4,,2 + je L(return) + movzbl (%eax), %ebx + testb $2, %cl + movb %bl, (%edx) + je L(return) + movzwl -2(%eax,%ecx), %ebx + movw %bx, -2(%edx,%ecx) + jmp L(return) + +L(len_9_16_bytes): + movq (%eax), %xmm0 + movq -8(%eax, %ecx), %xmm1 + movq %xmm0, (%edx) + movq %xmm1, -8(%edx, %ecx) + jmp L(return) + +L(len_5_8_bytes): + movl (%eax), %ebx + movl %ebx, (%edx) + movl -4(%eax,%ecx), %ebx + movl %ebx, -4(%edx,%ecx) + +L(return): + movl %edx, %eax +# if !defined USE_AS_BCOPY && defined USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif + RETURN + +END (MEMCPY) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S new file mode 100644 index 0000000000..687e083147 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S @@ -0,0 +1,1809 @@ +/* memcpy with SSSE3 and REP string. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/
+
+#include <sysdep.h>
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+	|| defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY	__memcpy_ssse3_rep
+# define MEMCPY_CHK	__memcpy_chk_ssse3_rep
+#endif
+
+#ifdef USE_AS_BCOPY
+# define SRC	PARMS
+# define DEST	SRC+4
+# define LEN	DEST+4
+#else
+# define DEST	PARMS
+# define SRC	DEST+4
+# define LEN	SRC+4
+#endif
+
+#define CFI_PUSH(REG) \
+	cfi_adjust_cfa_offset (4); \
+	cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+	cfi_adjust_cfa_offset (-4); \
+	cfi_restore (REG)
+
+#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+#define POP(REG)	popl REG; CFI_POP (REG)
+
+#ifdef SHARED
+# define PARMS	8	/* Preserve EBX. */
+# define ENTRANCE	PUSH (%ebx);
+# define RETURN_END	POP (%ebx); ret
+# define RETURN	RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B)	I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+	/* We first load PC into EBX. */ \
+	SETUP_PIC_REG(bx); \
+	/* Get the address of the jump table. */ \
+	addl	$(TABLE - .), %ebx; \
+	/* Get the entry and convert the relative offset to the \
+	   absolute address. */ \
+	addl	(%ebx,INDEX,SCALE), %ebx; \
+	/* We loaded the jump table.  Go. */ \
+	jmp	*%ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
+	addl	$(TABLE - .), %ebx
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
+	addl	(%ebx,INDEX,SCALE), %ebx; \
+	/* We loaded the jump table.  Go. */ \
+	jmp	*%ebx
+#else
+# define PARMS	4
+# define ENTRANCE
+# define RETURN_END	ret
+# define RETURN	RETURN_END
+# define JMPTBL(I, B)	I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX. */
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+	jmp	*TABLE(,INDEX,SCALE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+
+# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
+	jmp	*TABLE(,INDEX,SCALE)
+#endif
+
+	.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	movl	12(%esp), %eax
+	cmpl	%eax, 16(%esp)
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+	ENTRANCE
+	movl	LEN(%esp), %ecx
+	movl	SRC(%esp), %eax
+	movl	DEST(%esp), %edx
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%eax, %edx
+	jb	L(copy_forward)
+	je	L(fwd_write_0bytes)
+	cmp	$48, %ecx
+	jb	L(bk_write_less48bytes)
+	add	%ecx, %eax
+	cmp	%eax, %edx
+	movl	SRC(%esp), %eax
+	jb	L(copy_backward)
+
+L(copy_forward):
+#endif
+	cmp	$48, %ecx
+	jae	L(48bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dl, %al
+	jb	L(bk_write)
+#endif
+	add	%ecx, %edx
+	add	%ecx, %eax
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
+#endif
+
+	ALIGN (4)
+/* ECX >= 48 and EDX is 4 byte aligned.
*/ +L(48bytesormore): + movdqu (%eax), %xmm0 + PUSH (%edi) + movl %edx, %edi + and $-16, %edx + PUSH (%esi) + cfi_remember_state + add $16, %edx + movl %edi, %esi + sub %edx, %edi + add %edi, %ecx + sub %edi, %eax + +#ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +#endif + + mov %eax, %edi + jae L(large_page) + and $0xf, %edi + jz L(shl_0) + + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) + + ALIGN (4) +L(shl_0): + movdqu %xmm0, (%esi) + xor %edi, %edi + cmp $127, %ecx + ja L(shl_0_gobble) + lea -32(%ecx), %ecx +L(shl_0_loop): + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi +L(shl_0_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + add %edi, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state +L(shl_0_gobble): + +#ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov __x86_data_cache_size_half@GOTOFF(%ebx), %edi +# else + mov __x86_data_cache_size_half, %edi +# endif +#endif + mov %edi, %esi + shr $3, %esi + sub %esi, %edi + cmp %edi, %ecx + jae L(shl_0_gobble_mem_start) + sub $128, %ecx + ALIGN (4) +L(shl_0_gobble_cache_loop): + movdqa (%eax), %xmm0 + movaps 0x10(%eax), %xmm1 + movaps 0x20(%eax), %xmm2 + movaps 0x30(%eax), %xmm3 + movaps 0x40(%eax), %xmm4 + movaps 0x50(%eax), %xmm5 + movaps 0x60(%eax), %xmm6 + movaps 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $128, %ecx + movdqa %xmm0, (%edx) + movaps %xmm1, 0x10(%edx) + movaps %xmm2, 0x20(%edx) + movaps %xmm3, 0x30(%edx) + movaps %xmm4, 0x40(%edx) + movaps %xmm5, 0x50(%edx) + movaps %xmm6, 0x60(%edx) + movaps %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_cache_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(shl_0_cache_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx +L(shl_0_cache_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_cache_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx +L(shl_0_cache_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_cache_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx +L(shl_0_cache_less_16bytes): + add %ecx, %edx + add %ecx, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN 
(4) +L(shl_0_gobble_mem_start): + cmp %al, %dl + je L(copy_page_by_rep) + sub $128, %ecx +L(shl_0_gobble_mem_loop): + prefetchnta 0x1c0(%eax) + prefetchnta 0x280(%eax) + prefetchnta 0x1c0(%edx) + prefetchnta 0x280(%edx) + + movdqa (%eax), %xmm0 + movaps 0x10(%eax), %xmm1 + movaps 0x20(%eax), %xmm2 + movaps 0x30(%eax), %xmm3 + movaps 0x40(%eax), %xmm4 + movaps 0x50(%eax), %xmm5 + movaps 0x60(%eax), %xmm6 + movaps 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $0x80, %ecx + movdqa %xmm0, (%edx) + movaps %xmm1, 0x10(%edx) + movaps %xmm2, 0x20(%edx) + movaps %xmm3, 0x30(%edx) + movaps %xmm4, 0x40(%edx) + movaps %xmm5, 0x50(%edx) + movaps %xmm6, 0x60(%edx) + movaps %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_mem_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(shl_0_mem_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx +L(shl_0_mem_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_mem_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx +L(shl_0_mem_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_mem_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx +L(shl_0_mem_less_16bytes): + add %ecx, %edx + add %ecx, %eax + POP (%esi) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_1): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $1, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_1_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_1_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_1_loop) + +L(shl_1_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 1(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_2): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $2, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_2_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_2_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_2_loop) + +L(shl_2_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 2(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) 
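+/* Editorial note: L(shl_3) through L(shl_15) below repeat the pattern
+   of L(shl_1)/L(shl_2) above, with the source misalignment baked into
+   the palignr immediate of each copy of the loop.  A rough C model of
+   one palignr $n step (illustrative sketch; palignr_model is a made-up
+   helper, not part of this file):
+
+   static void
+   palignr_model (unsigned char dst[16], const unsigned char hi[16],
+                  const unsigned char lo[16], int n)
+   {
+     // Byte i of the result is byte i+n of the 32-byte value hi:lo,
+     // i.e. the concatenation shifted right by n bytes.
+     for (int i = 0; i < 16; i++)
+       dst[i] = (i + n < 16) ? lo[i + n] : hi[i + n - 16];
+   }
+
+   Two aligned 16-byte loads plus one palignr thus produce one
+   realigned 16-byte chunk without any unaligned load in the loop.  */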
+L(shl_3): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $3, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_3_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_3_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_3_loop) + +L(shl_3_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 3(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_4): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $4, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_4_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_4_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_4_loop) + +L(shl_4_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 4(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_5): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $5, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_5_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_5_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_5_loop) + +L(shl_5_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 5(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_6): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $6, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_6_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_6_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_6_loop) + +L(shl_6_end): + add $32, %ecx + add %ecx, 
%edi + add %edi, %edx + lea 6(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_7): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $7, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_7_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_7_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_7_loop) + +L(shl_7_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 7(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_8): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $8, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_8_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_8_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_8_loop) + +L(shl_8_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 8(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_9): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $9, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_9_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_9_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_9_loop) + +L(shl_9_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 9(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_10): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $10, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_10_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_10_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $10, %xmm2, 
%xmm3 + palignr $10, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_10_loop) + +L(shl_10_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 10(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_11): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $11, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_11_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_11_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_11_loop) + +L(shl_11_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 11(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_12): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $12, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_12_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_12_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_12_loop) + +L(shl_12_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 12(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_13): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $13, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_13_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_13_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_13_loop) + +L(shl_13_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 13(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_14): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $14, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_14_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + lea 
32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_14_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_14_loop) + +L(shl_14_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 14(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(shl_15): + BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) + sub $15, %eax + movaps (%eax), %xmm1 + xor %edi, %edi + sub $32, %ecx + movdqu %xmm0, (%esi) + POP (%esi) +L(shl_15_loop): + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(shl_15_end) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(shl_15_loop) + +L(shl_15_end): + add $32, %ecx + add %ecx, %edi + add %edi, %edx + lea 15(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + + + ALIGN (4) +L(fwd_write_44bytes): + movl -44(%eax), %ecx + movl %ecx, -44(%edx) +L(fwd_write_40bytes): + movl -40(%eax), %ecx + movl %ecx, -40(%edx) +L(fwd_write_36bytes): + movl -36(%eax), %ecx + movl %ecx, -36(%edx) +L(fwd_write_32bytes): + movl -32(%eax), %ecx + movl %ecx, -32(%edx) +L(fwd_write_28bytes): + movl -28(%eax), %ecx + movl %ecx, -28(%edx) +L(fwd_write_24bytes): + movl -24(%eax), %ecx + movl %ecx, -24(%edx) +L(fwd_write_20bytes): + movl -20(%eax), %ecx + movl %ecx, -20(%edx) +L(fwd_write_16bytes): + movl -16(%eax), %ecx + movl %ecx, -16(%edx) +L(fwd_write_12bytes): + movl -12(%eax), %ecx + movl %ecx, -12(%edx) +L(fwd_write_8bytes): + movl -8(%eax), %ecx + movl %ecx, -8(%edx) +L(fwd_write_4bytes): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +L(fwd_write_0bytes): +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_5bytes): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_45bytes): + movl -45(%eax), %ecx + movl %ecx, -45(%edx) +L(fwd_write_41bytes): + movl -41(%eax), %ecx + movl %ecx, -41(%edx) +L(fwd_write_37bytes): + movl -37(%eax), %ecx + movl %ecx, -37(%edx) +L(fwd_write_33bytes): + movl -33(%eax), %ecx + movl %ecx, -33(%edx) +L(fwd_write_29bytes): + movl -29(%eax), %ecx + movl %ecx, -29(%edx) +L(fwd_write_25bytes): + movl -25(%eax), %ecx + movl %ecx, -25(%edx) +L(fwd_write_21bytes): + movl -21(%eax), %ecx + movl %ecx, -21(%edx) +L(fwd_write_17bytes): + movl -17(%eax), %ecx + movl %ecx, -17(%edx) +L(fwd_write_13bytes): + movl -13(%eax), %ecx + movl %ecx, -13(%edx) +L(fwd_write_9bytes): + movl -9(%eax), %ecx + movl %ecx, -9(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) +L(fwd_write_1bytes): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif 
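+
+/* The L(fwd_write_*bytes) labels form fall-through chains: the jump
+   table indexes into a chain by the number of bytes still to copy,
+   each link moves one 4-byte word, and separate chains end with the
+   byte/word stores for lengths that are 1, 2 or 3 mod 4.  A rough C
+   sketch of what one dispatch computes (not the exact code; d and s
+   point one past the end of the region, as %edx and %eax do here):
+
+     #include <string.h>
+
+     static void copy_tail (unsigned char *d, const unsigned char *s,
+                            unsigned n)
+     {
+       while (n >= 4)
+         {
+           memcpy (d - n, s - n, 4);
+           n -= 4;
+         }
+       if (n > 0)
+         memcpy (d - n, s - n, n);
+     }
+
+   The assembly unrolls this into 48 entry points so that one indirect
+   jump replaces the loop and all of its branches.  memcpy returns
+   DEST from the stack, while mempcpy returns %edx, which already
+   points at DEST + LEN.  */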
+#endif + RETURN + + ALIGN (4) +L(fwd_write_46bytes): + movl -46(%eax), %ecx + movl %ecx, -46(%edx) +L(fwd_write_42bytes): + movl -42(%eax), %ecx + movl %ecx, -42(%edx) +L(fwd_write_38bytes): + movl -38(%eax), %ecx + movl %ecx, -38(%edx) +L(fwd_write_34bytes): + movl -34(%eax), %ecx + movl %ecx, -34(%edx) +L(fwd_write_30bytes): + movl -30(%eax), %ecx + movl %ecx, -30(%edx) +L(fwd_write_26bytes): + movl -26(%eax), %ecx + movl %ecx, -26(%edx) +L(fwd_write_22bytes): + movl -22(%eax), %ecx + movl %ecx, -22(%edx) +L(fwd_write_18bytes): + movl -18(%eax), %ecx + movl %ecx, -18(%edx) +L(fwd_write_14bytes): + movl -14(%eax), %ecx + movl %ecx, -14(%edx) +L(fwd_write_10bytes): + movl -10(%eax), %ecx + movl %ecx, -10(%edx) +L(fwd_write_6bytes): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) +L(fwd_write_2bytes): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN + + ALIGN (4) +L(fwd_write_47bytes): + movl -47(%eax), %ecx + movl %ecx, -47(%edx) +L(fwd_write_43bytes): + movl -43(%eax), %ecx + movl %ecx, -43(%edx) +L(fwd_write_39bytes): + movl -39(%eax), %ecx + movl %ecx, -39(%edx) +L(fwd_write_35bytes): + movl -35(%eax), %ecx + movl %ecx, -35(%edx) +L(fwd_write_31bytes): + movl -31(%eax), %ecx + movl %ecx, -31(%edx) +L(fwd_write_27bytes): + movl -27(%eax), %ecx + movl %ecx, -27(%edx) +L(fwd_write_23bytes): + movl -23(%eax), %ecx + movl %ecx, -23(%edx) +L(fwd_write_19bytes): + movl -19(%eax), %ecx + movl %ecx, -19(%edx) +L(fwd_write_15bytes): + movl -15(%eax), %ecx + movl %ecx, -15(%edx) +L(fwd_write_11bytes): + movl -11(%eax), %ecx + movl %ecx, -11(%edx) +L(fwd_write_7bytes): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) +L(fwd_write_3bytes): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +#ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +#endif + RETURN_END + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(large_page): + movdqu (%eax), %xmm1 + movdqu %xmm0, (%esi) + movntdq %xmm1, (%edx) + add $0x10, %eax + add $0x10, %edx + sub $0x10, %ecx + cmp %al, %dl + je L(copy_page_by_rep) +L(large_page_loop_init): + POP (%esi) + sub $0x80, %ecx + POP (%edi) +L(large_page_loop): + prefetchnta 0x1c0(%eax) + prefetchnta 0x280(%eax) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + movdqu 0x40(%eax), %xmm4 + movdqu 0x50(%eax), %xmm5 + movdqu 0x60(%eax), %xmm6 + movdqu 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + lfence + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + movntdq %xmm4, 0x40(%edx) + movntdq %xmm5, 0x50(%edx) + movntdq %xmm6, 0x60(%edx) + movntdq %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + jae L(large_page_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(large_page_less_64bytes) + + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + lea 0x40(%eax), %eax + + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + lea 0x40(%edx), %edx + sub $0x40, %ecx +L(large_page_less_64bytes): + cmp $32, %ecx + jb L(large_page_less_32bytes) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + lea 0x20(%eax), %eax + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + lea 0x20(%edx), %edx + sub $0x20, %ecx +L(large_page_less_32bytes): + add %ecx, %edx + add %ecx, %eax + sfence 
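+
+/* L(large_page) is taken when the count exceeds half the shared cache
+   size: caching the destination would only evict useful lines, so the
+   loop streams with movntdq and prefetches the source with
+   prefetchnta, and the sfence above orders the non-temporal stores
+   before the tail is copied normally.  A minimal sketch of the same
+   idea with SSE2 intrinsics (assuming dst is 16-byte aligned and n is
+   a multiple of 16):
+
+     #include <emmintrin.h>
+     #include <stddef.h>
+
+     static void stream_copy (char *dst, const char *src, size_t n)
+     {
+       for (size_t i = 0; i < n; i += 16)
+         {
+           _mm_prefetch (src + i + 448, _MM_HINT_NTA);
+           __m128i v = _mm_loadu_si128 ((const __m128i *) (src + i));
+           _mm_stream_si128 ((__m128i *) (dst + i), v);
+         }
+       _mm_sfence ();
+     }
+
+   448 is 0x1c0, the same lookahead distance the loop above uses.  */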
+ BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + cfi_restore_state + cfi_remember_state + ALIGN (4) +L(copy_page_by_rep): + mov %eax, %esi + mov %edx, %edi + mov %ecx, %edx + shr $2, %ecx + and $3, %edx + rep movsl + jz L(copy_page_by_rep_exit) + cmp $2, %edx + jb L(copy_page_by_rep_left_1) + movzwl (%esi), %eax + movw %ax, (%edi) + add $2, %esi + add $2, %edi + sub $2, %edx + jz L(copy_page_by_rep_exit) +L(copy_page_by_rep_left_1): + movzbl (%esi), %eax + movb %al, (%edi) +L(copy_page_by_rep_exit): + POP (%esi) + POP (%edi) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_44bytes): + movl 40(%eax), %ecx + movl %ecx, 40(%edx) +L(bk_write_40bytes): + movl 36(%eax), %ecx + movl %ecx, 36(%edx) +L(bk_write_36bytes): + movl 32(%eax), %ecx + movl %ecx, 32(%edx) +L(bk_write_32bytes): + movl 28(%eax), %ecx + movl %ecx, 28(%edx) +L(bk_write_28bytes): + movl 24(%eax), %ecx + movl %ecx, 24(%edx) +L(bk_write_24bytes): + movl 20(%eax), %ecx + movl %ecx, 20(%edx) +L(bk_write_20bytes): + movl 16(%eax), %ecx + movl %ecx, 16(%edx) +L(bk_write_16bytes): + movl 12(%eax), %ecx + movl %ecx, 12(%edx) +L(bk_write_12bytes): + movl 8(%eax), %ecx + movl %ecx, 8(%edx) +L(bk_write_8bytes): + movl 4(%eax), %ecx + movl %ecx, 4(%edx) +L(bk_write_4bytes): + movl (%eax), %ecx + movl %ecx, (%edx) +L(bk_write_0bytes): +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_45bytes): + movl 41(%eax), %ecx + movl %ecx, 41(%edx) +L(bk_write_41bytes): + movl 37(%eax), %ecx + movl %ecx, 37(%edx) +L(bk_write_37bytes): + movl 33(%eax), %ecx + movl %ecx, 33(%edx) +L(bk_write_33bytes): + movl 29(%eax), %ecx + movl %ecx, 29(%edx) +L(bk_write_29bytes): + movl 25(%eax), %ecx + movl %ecx, 25(%edx) +L(bk_write_25bytes): + movl 21(%eax), %ecx + movl %ecx, 21(%edx) +L(bk_write_21bytes): + movl 17(%eax), %ecx + movl %ecx, 17(%edx) +L(bk_write_17bytes): + movl 13(%eax), %ecx + movl %ecx, 13(%edx) +L(bk_write_13bytes): + movl 9(%eax), %ecx + movl %ecx, 9(%edx) +L(bk_write_9bytes): + movl 5(%eax), %ecx + movl %ecx, 5(%edx) +L(bk_write_5bytes): + movl 1(%eax), %ecx + movl %ecx, 1(%edx) +L(bk_write_1bytes): + movzbl (%eax), %ecx + movb %cl, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_46bytes): + movl 42(%eax), %ecx + movl %ecx, 42(%edx) +L(bk_write_42bytes): + movl 38(%eax), %ecx + movl %ecx, 38(%edx) +L(bk_write_38bytes): + movl 34(%eax), %ecx + movl %ecx, 34(%edx) +L(bk_write_34bytes): + movl 30(%eax), %ecx + movl %ecx, 30(%edx) +L(bk_write_30bytes): + movl 26(%eax), %ecx + movl %ecx, 26(%edx) +L(bk_write_26bytes): + movl 22(%eax), %ecx + movl %ecx, 22(%edx) +L(bk_write_22bytes): + movl 18(%eax), %ecx + movl %ecx, 18(%edx) +L(bk_write_18bytes): + movl 14(%eax), %ecx + movl %ecx, 14(%edx) +L(bk_write_14bytes): + movl 10(%eax), %ecx + movl %ecx, 10(%edx) +L(bk_write_10bytes): + movl 6(%eax), %ecx + movl %ecx, 6(%edx) +L(bk_write_6bytes): + movl 2(%eax), %ecx + movl %ecx, 2(%edx) +L(bk_write_2bytes): + movzwl (%eax), %ecx + movw %cx, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN + + ALIGN (4) +L(bk_write_47bytes): + movl 43(%eax), %ecx + movl %ecx, 43(%edx) +L(bk_write_43bytes): + movl 39(%eax), %ecx + 
movl %ecx, 39(%edx) +L(bk_write_39bytes): + movl 35(%eax), %ecx + movl %ecx, 35(%edx) +L(bk_write_35bytes): + movl 31(%eax), %ecx + movl %ecx, 31(%edx) +L(bk_write_31bytes): + movl 27(%eax), %ecx + movl %ecx, 27(%edx) +L(bk_write_27bytes): + movl 23(%eax), %ecx + movl %ecx, 23(%edx) +L(bk_write_23bytes): + movl 19(%eax), %ecx + movl %ecx, 19(%edx) +L(bk_write_19bytes): + movl 15(%eax), %ecx + movl %ecx, 15(%edx) +L(bk_write_15bytes): + movl 11(%eax), %ecx + movl %ecx, 11(%edx) +L(bk_write_11bytes): + movl 7(%eax), %ecx + movl %ecx, 7(%edx) +L(bk_write_7bytes): + movl 3(%eax), %ecx + movl %ecx, 3(%edx) +L(bk_write_3bytes): + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +#ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +#endif + RETURN_END + + + .pushsection .rodata.ssse3,"a",@progbits + ALIGN (2) +L(table_48bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) + .int JMPTBL 
(L(fwd_write_44bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) + + ALIGN (2) +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + ALIGN (2) +L(table_48_bytes_bwd): + .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_44bytes), 
L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) + + .popsection + +#ifdef USE_AS_MEMMOVE + ALIGN (4) +L(copy_backward): + PUSH (%esi) + movl %eax, %esi + add %ecx, %edx + add %ecx, %esi + testl $0x3, %edx + jnz L(bk_align) + +L(bk_aligned_4): + cmp $64, %ecx + jae L(bk_write_more64bytes) + +L(bk_write_64bytesless): + cmp $32, %ecx + jb L(bk_write_less32bytes) + +L(bk_write_more32bytes): + /* Copy 32 bytes at a time. */ + sub $32, %ecx + movl -4(%esi), %eax + movl %eax, -4(%edx) + movl -8(%esi), %eax + movl %eax, -8(%edx) + movl -12(%esi), %eax + movl %eax, -12(%edx) + movl -16(%esi), %eax + movl %eax, -16(%edx) + movl -20(%esi), %eax + movl %eax, -20(%edx) + movl -24(%esi), %eax + movl %eax, -24(%edx) + movl -28(%esi), %eax + movl %eax, -28(%edx) + movl -32(%esi), %eax + movl %eax, -32(%edx) + sub $32, %edx + sub $32, %esi + +L(bk_write_less32bytes): + movl %esi, %eax + sub %ecx, %edx + sub %ecx, %eax + POP (%esi) +L(bk_write_less48bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) + + CFI_PUSH (%esi) + ALIGN (4) +L(bk_align): + cmp $8, %ecx + jbe L(bk_write_less32bytes) + testl $1, %edx + /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, + then (EDX & 2) must be != 0. */ + jz L(bk_got2) + sub $1, %esi + sub $1, %ecx + sub $1, %edx + movzbl (%esi), %eax + movb %al, (%edx) + + testl $2, %edx + jz L(bk_aligned_4) + +L(bk_got2): + sub $2, %esi + sub $2, %ecx + sub $2, %edx + movzwl (%esi), %eax + movw %ax, (%edx) + jmp L(bk_aligned_4) + + ALIGN (4) +L(bk_write_more64bytes): + /* Check alignment of last byte. */ + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + +/* EDX is aligned 4 bytes, but not 16 bytes. */ +L(bk_ssse3_align): + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %esi + sub $4, %ecx + sub $4, %edx + movl (%esi), %eax + movl %eax, (%edx) + +L(bk_ssse3_cpy_pre): + cmp $64, %ecx + jb L(bk_write_more32bytes) + +L(bk_ssse3_cpy): + sub $64, %esi + sub $64, %ecx + sub $64, %edx + movdqu 0x30(%esi), %xmm3 + movdqa %xmm3, 0x30(%edx) + movdqu 0x20(%esi), %xmm2 + movdqa %xmm2, 0x20(%edx) + movdqu 0x10(%esi), %xmm1 + movdqa %xmm1, 0x10(%edx) + movdqu (%esi), %xmm0 + movdqa %xmm0, (%edx) + cmp $64, %ecx + jae L(bk_ssse3_cpy) + jmp L(bk_write_64bytesless) + +#endif + +END (MEMCPY) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S new file mode 100644 index 0000000000..53e8a6ca1d --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy-ssse3.S @@ -0,0 +1,3162 @@ +/* memcpy with SSSE3 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc) \
+    && (defined SHARED \
+        || defined USE_AS_MEMMOVE \
+        || !defined USE_MULTIARCH)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+# define MEMCPY         __memcpy_ssse3
+# define MEMCPY_CHK     __memcpy_chk_ssse3
+# endif
+
+# ifdef USE_AS_BCOPY
+# define SRC            PARMS
+# define DEST           SRC+4
+# define LEN            DEST+4
+# else
+# define DEST           PARMS
+# define SRC            DEST+4
+# define LEN            SRC+4
+# endif
+
+# define CFI_PUSH(REG) \
+       cfi_adjust_cfa_offset (4); \
+       cfi_rel_offset (REG, 0)
+
+# define CFI_POP(REG) \
+       cfi_adjust_cfa_offset (-4); \
+       cfi_restore (REG)
+
+# define PUSH(REG)      pushl REG; CFI_PUSH (REG)
+# define POP(REG)       popl REG; CFI_POP (REG)
+
+# ifdef SHARED
+# define PARMS          8               /* Preserve EBX.  */
+# define ENTRANCE       PUSH (%ebx);
+# define RETURN_END     POP (%ebx); ret
+# define RETURN         RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B)   I - B
+
+/* Load an entry in a jump table into EBX and branch to it.  TABLE is a
+   jump table with relative offsets.  INDEX is a register containing the
+   index into the jump table.  SCALE is the scale of INDEX.  */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+       /* We first load PC into EBX.  */ \
+       SETUP_PIC_REG(bx); \
+       /* Get the address of the jump table.  */ \
+       addl    $(TABLE - .), %ebx; \
+       /* Get the entry and convert the relative offset to the \
+          absolute address.  */ \
+       addl    (%ebx, INDEX, SCALE), %ebx; \
+       /* We loaded the jump table.  Go.  */ \
+       jmp     *%ebx
+# else
+
+# define PARMS          4
+# define ENTRANCE
+# define RETURN_END     ret
+# define RETURN         RETURN_END
+# define JMPTBL(I, B)   I
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   absolute offsets.  INDEX is a register containing the index into the
+   jump table.  SCALE is the scale of INDEX.
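+
+   A rough C analogue of the two table forms, using GNU computed
+   gotos with hypothetical label names:
+
+     void dispatch (int i)
+     {
+       static const int rel[] =
+         { &&op0 - &&op0, &&op1 - &&op0 };
+       goto *(&&op0 + rel[i]);
+     op0: return;
+     op1: return;
+     }
+
+   The SHARED variant corresponds to the relative table above: label
+   differences are link-time constants, so the table needs no dynamic
+   relocations and can stay read-only in a shared object.  Storing
+   absolute entries, as in this non-SHARED variant, would need one
+   relocation per entry in PIC code, but lets the branch be a single
+   indirect jmp through the table.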
*/ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(, INDEX, SCALE) +# endif + + .section .text.ssse3,"ax",@progbits +# if !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +# endif +ENTRY (MEMCPY) + ENTRANCE + movl LEN(%esp), %ecx + movl SRC(%esp), %eax + movl DEST(%esp), %edx + +# ifdef USE_AS_MEMMOVE + cmp %eax, %edx + jb L(copy_forward) + je L(fwd_write_0bytes) + cmp $32, %ecx + jae L(memmove_bwd) + jmp L(bk_write_less32bytes_2) + + .p2align 4 +L(memmove_bwd): + add %ecx, %eax + cmp %eax, %edx + movl SRC(%esp), %eax + jb L(copy_backward) + +L(copy_forward): +# endif + cmp $48, %ecx + jae L(48bytesormore) + +L(fwd_write_less32bytes): +# ifndef USE_AS_MEMMOVE + cmp %dl, %al + jb L(bk_write) +# endif + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) +# ifndef USE_AS_MEMMOVE + .p2align 4 +L(bk_write): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) +# endif + + .p2align 4 +L(48bytesormore): +# ifndef USE_AS_MEMMOVE + movlpd (%eax), %xmm0 + movlpd 8(%eax), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) +# else + movdqu (%eax), %xmm0 +# endif + PUSH (%edi) + movl %edx, %edi + and $-16, %edx + add $16, %edx + sub %edx, %edi + add %edi, %ecx + sub %edi, %eax + +# ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_shared_cache_size_half, %ecx +# endif +# endif + + mov %eax, %edi + jae L(large_page) + and $0xf, %edi + jz L(shl_0) + BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) + + .p2align 4 +L(shl_0): +# ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +# endif + xor %edi, %edi + cmp $127, %ecx + ja L(shl_0_gobble) + lea -32(%ecx), %ecx + + .p2align 4 +L(shl_0_loop): + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + jb L(shl_0_end) + + movdqa (%eax, %edi), %xmm0 + movdqa 16(%eax, %edi), %xmm1 + sub $32, %ecx + movdqa %xmm0, (%edx, %edi) + movdqa %xmm1, 16(%edx, %edi) + lea 32(%edi), %edi + +L(shl_0_end): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + add %edi, %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_0_gobble): +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + POP (%edi) + lea -128(%ecx), %ecx + jae L(shl_0_gobble_mem_loop) + + .p2align 4 +L(shl_0_gobble_cache_loop): + movdqa (%eax), %xmm0 + movdqa 0x10(%eax), %xmm1 + movdqa 0x20(%eax), %xmm2 + movdqa 0x30(%eax), %xmm3 + movdqa 0x40(%eax), %xmm4 + movdqa 0x50(%eax), %xmm5 + movdqa 0x60(%eax), %xmm6 + movdqa 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa %xmm2, 0x20(%edx) + 
movdqa %xmm3, 0x30(%edx) + movdqa %xmm4, 0x40(%edx) + movdqa %xmm5, 0x50(%edx) + movdqa %xmm6, 0x60(%edx) + movdqa %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_cache_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(shl_0_cache_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx + +L(shl_0_cache_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_cache_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx + +L(shl_0_cache_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_cache_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx + +L(shl_0_cache_less_16bytes): + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 +L(shl_0_gobble_mem_loop): + prefetcht0 0x1c0(%eax) + prefetcht0 0x280(%eax) + prefetcht0 0x1c0(%edx) + + movdqa (%eax), %xmm0 + movdqa 0x10(%eax), %xmm1 + movdqa 0x20(%eax), %xmm2 + movdqa 0x30(%eax), %xmm3 + movdqa 0x40(%eax), %xmm4 + movdqa 0x50(%eax), %xmm5 + movdqa 0x60(%eax), %xmm6 + movdqa 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + sub $0x80, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + movdqa %xmm2, 0x20(%edx) + movdqa %xmm3, 0x30(%edx) + movdqa %xmm4, 0x40(%edx) + movdqa %xmm5, 0x50(%edx) + movdqa %xmm6, 0x60(%edx) + movdqa %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + + jae L(shl_0_gobble_mem_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(shl_0_mem_less_64bytes) + + movdqa (%eax), %xmm0 + sub $0x40, %ecx + movdqa 0x10(%eax), %xmm1 + + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + + movdqa 0x20(%eax), %xmm0 + movdqa 0x30(%eax), %xmm1 + add $0x40, %eax + + movdqa %xmm0, 0x20(%edx) + movdqa %xmm1, 0x30(%edx) + add $0x40, %edx + +L(shl_0_mem_less_64bytes): + cmp $0x20, %ecx + jb L(shl_0_mem_less_32bytes) + movdqa (%eax), %xmm0 + sub $0x20, %ecx + movdqa 0x10(%eax), %xmm1 + add $0x20, %eax + movdqa %xmm0, (%edx) + movdqa %xmm1, 0x10(%edx) + add $0x20, %edx + +L(shl_0_mem_less_32bytes): + cmp $0x10, %ecx + jb L(shl_0_mem_less_16bytes) + sub $0x10, %ecx + movdqa (%eax), %xmm0 + add $0x10, %eax + movdqa %xmm0, (%edx) + add $0x10, %edx + +L(shl_0_mem_less_16bytes): + add %ecx, %edx + add %ecx, %eax + BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) + + .p2align 4 +L(shl_1): +# ifndef USE_AS_MEMMOVE + movaps -1(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -1(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_1_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl1LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + movaps 47(%eax), %xmm4 + movaps 63(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + palignr $1, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $1, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $1, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub 
$64, %ecx + ja L(Shl1LoopStart) + +L(Shl1LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_1_no_prefetch): + lea -32(%ecx), %ecx + lea -1(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_1_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_1_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_1_no_prefetch_loop) + +L(sh_1_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 1(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_2): +# ifndef USE_AS_MEMMOVE + movaps -2(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -2(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_2_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl2LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + movaps 46(%eax), %xmm4 + movaps 62(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + palignr $2, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $2, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $2, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl2LoopStart) + +L(Shl2LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_2_no_prefetch): + lea -32(%ecx), %ecx + lea -2(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_2_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_2_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_2_no_prefetch_loop) + +L(sh_2_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 2(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_3): +# ifndef USE_AS_MEMMOVE + 
movaps -3(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -3(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_3_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl3LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + movaps 45(%eax), %xmm4 + movaps 61(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + palignr $3, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $3, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $3, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl3LoopStart) + +L(Shl3LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_3_no_prefetch): + lea -32(%ecx), %ecx + lea -3(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_3_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_3_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_3_no_prefetch_loop) + +L(sh_3_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 3(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_4): +# ifndef USE_AS_MEMMOVE + movaps -4(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -4(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_4_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl4LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + movaps 44(%eax), %xmm4 + movaps 60(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + palignr $4, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $4, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $4, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl4LoopStart) + +L(Shl4LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 
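+
+/* Each L(shl_N) case handles a source that is N bytes past 16-byte
+   alignment: two aligned loads and a palignr reassemble every 16-byte
+   destination block.  palignr takes its shift count as an immediate,
+   which is why the loop is replicated sixteen times and dispatched
+   through L(shl_table) rather than written once with a variable
+   shift.  A sketch of the N = 1 case with SSSE3 intrinsics (dst
+   16-byte aligned, n a multiple of 16; like the assembly, the last
+   aligned load may read a few bytes past src + n, but it starts at a
+   valid byte and so cannot cross into an unmapped page):
+
+     #include <tmmintrin.h>
+     #include <stddef.h>
+
+     static void copy_shl1 (unsigned char *dst,
+                            const unsigned char *src, size_t n)
+     {
+       const __m128i *s = (const __m128i *) (src - 1);
+       __m128i *d = (__m128i *) dst;
+       __m128i prev = _mm_load_si128 (s);
+       for (size_t i = 0; i < n / 16; i++)
+         {
+           __m128i cur = _mm_load_si128 (s + 1 + i);
+           _mm_store_si128 (d + i, _mm_alignr_epi8 (cur, prev, 1));
+           prev = cur;
+         }
+     }
+  */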
+L(sh_4_no_prefetch): + lea -32(%ecx), %ecx + lea -4(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_4_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_4_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_4_no_prefetch_loop) + +L(sh_4_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 4(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_5): +# ifndef USE_AS_MEMMOVE + movaps -5(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -5(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_5_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl5LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + movaps 43(%eax), %xmm4 + movaps 59(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + palignr $5, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $5, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $5, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl5LoopStart) + +L(Shl5LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 11(%eax), %xmm2 + movaps 27(%eax), %xmm3 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_5_no_prefetch): + lea -32(%ecx), %ecx + lea -5(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_5_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_5_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $5, %xmm2, %xmm3 + palignr $5, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_5_no_prefetch_loop) + +L(sh_5_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 5(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_6): +# ifndef USE_AS_MEMMOVE + movaps -6(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -6(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb 
L(sh_6_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl6LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + movaps 42(%eax), %xmm4 + movaps 58(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + palignr $6, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $6, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $6, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl6LoopStart) + +L(Shl6LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 10(%eax), %xmm2 + movaps 26(%eax), %xmm3 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_6_no_prefetch): + lea -32(%ecx), %ecx + lea -6(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_6_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jb L(sh_6_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $6, %xmm2, %xmm3 + palignr $6, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + + jae L(sh_6_no_prefetch_loop) + +L(sh_6_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 6(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_7): +# ifndef USE_AS_MEMMOVE + movaps -7(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -7(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_7_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl7LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + movaps 41(%eax), %xmm4 + movaps 57(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + palignr $7, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $7, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $7, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl7LoopStart) + +L(Shl7LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 9(%eax), %xmm2 + movaps 25(%eax), %xmm3 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_7_no_prefetch): + lea -32(%ecx), %ecx + lea -7(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_7_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_7_end_no_prefetch_loop) 
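+
+/* The loop body is written out twice so the carried source block can
+   ping-pong between %xmm1 and %xmm4: each half consumes the block the
+   other half saved, which avoids a register-to-register move every
+   iteration.  (The prefetching variants above do the same and also
+   fetch 0x1c0 = 448 bytes ahead of both streams.)  In outline, with
+   step() standing in for the two palignr stores - a hypothetical
+   helper, not a function in this file:
+
+     prev_a = first_block;
+     for (;;)
+       {
+         prev_b = step (prev_a);      first copy of the body
+         if (remaining < 32) break;
+         prev_a = step (prev_b);      second copy, registers swapped
+         if (remaining < 32) break;
+       }
+  */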
+ + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $7, %xmm2, %xmm3 + palignr $7, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_7_no_prefetch_loop) + +L(sh_7_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 7(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_8): +# ifndef USE_AS_MEMMOVE + movaps -8(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -8(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_8_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl8LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + movaps 40(%eax), %xmm4 + movaps 56(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + palignr $8, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $8, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $8, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl8LoopStart) + +L(LoopLeave8): + add $32, %ecx + jle L(shl_end_0) + + movaps 8(%eax), %xmm2 + movaps 24(%eax), %xmm3 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_8_no_prefetch): + lea -32(%ecx), %ecx + lea -8(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_8_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_8_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $8, %xmm2, %xmm3 + palignr $8, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_8_no_prefetch_loop) + +L(sh_8_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 8(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_9): +# ifndef USE_AS_MEMMOVE + movaps -9(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -9(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_9_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl9LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + movaps 39(%eax), %xmm4 + movaps 55(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + palignr $9, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $9, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $9, %xmm1, %xmm2 + movaps %xmm4, 
32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl9LoopStart) + +L(Shl9LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 7(%eax), %xmm2 + movaps 23(%eax), %xmm3 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_9_no_prefetch): + lea -32(%ecx), %ecx + lea -9(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_9_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_9_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $9, %xmm2, %xmm3 + palignr $9, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_9_no_prefetch_loop) + +L(sh_9_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 9(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_10): +# ifndef USE_AS_MEMMOVE + movaps -10(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -10(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_10_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl10LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + movaps 38(%eax), %xmm4 + movaps 54(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + palignr $10, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $10, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $10, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl10LoopStart) + +L(Shl10LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 6(%eax), %xmm2 + movaps 22(%eax), %xmm3 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_10_no_prefetch): + lea -32(%ecx), %ecx + lea -10(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_10_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_10_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $10, %xmm2, %xmm3 + palignr $10, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_10_no_prefetch_loop) + +L(sh_10_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 10(%edi, %eax), %eax + POP 
(%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_11): +# ifndef USE_AS_MEMMOVE + movaps -11(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -11(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_11_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl11LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + movaps 37(%eax), %xmm4 + movaps 53(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + palignr $11, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $11, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $11, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl11LoopStart) + +L(Shl11LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 5(%eax), %xmm2 + movaps 21(%eax), %xmm3 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_11_no_prefetch): + lea -32(%ecx), %ecx + lea -11(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_11_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_11_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $11, %xmm2, %xmm3 + palignr $11, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_11_no_prefetch_loop) + +L(sh_11_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 11(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_12): +# ifndef USE_AS_MEMMOVE + movaps -12(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -12(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_12_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl12LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + movaps 36(%eax), %xmm4 + movaps 52(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + palignr $12, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $12, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $12, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl12LoopStart) + +L(Shl12LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 4(%eax), %xmm2 + movaps 20(%eax), %xmm3 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + 
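+ /* Editorial note: 32 more bytes are now stored; advance src and dst
+ past them and past the remaining tail so the forward jump table
+ below writes the last %ecx (at most 32) bytes relative to the
+ end pointers.  */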
lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_12_no_prefetch): + lea -32(%ecx), %ecx + lea -12(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_12_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_12_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $12, %xmm2, %xmm3 + palignr $12, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_12_no_prefetch_loop) + +L(sh_12_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 12(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_13): +# ifndef USE_AS_MEMMOVE + movaps -13(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -13(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_13_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl13LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + movaps 35(%eax), %xmm4 + movaps 51(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + palignr $13, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $13, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $13, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl13LoopStart) + +L(Shl13LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 3(%eax), %xmm2 + movaps 19(%eax), %xmm3 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_13_no_prefetch): + lea -32(%ecx), %ecx + lea -13(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_13_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_13_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $13, %xmm2, %xmm3 + palignr $13, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_13_no_prefetch_loop) + +L(sh_13_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 13(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_14): +# ifndef USE_AS_MEMMOVE + movaps -14(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -14(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + 
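+ /* Editorial note: PIC code cannot address __x86_data_cache_size_half
+ directly, so build the GOT pointer in %ebx first and read the
+ cache-size threshold relative to it.  */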
SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_14_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl14LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + movaps 34(%eax), %xmm4 + movaps 50(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $14, %xmm4, %xmm5 + palignr $14, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $14, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $14, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl14LoopStart) + +L(Shl14LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 2(%eax), %xmm2 + movaps 18(%eax), %xmm3 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_14_no_prefetch): + lea -32(%ecx), %ecx + lea -14(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_14_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_14_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $14, %xmm2, %xmm3 + palignr $14, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_14_no_prefetch_loop) + +L(sh_14_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 14(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_15): +# ifndef USE_AS_MEMMOVE + movaps -15(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -15(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_15_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl15LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + movaps 33(%eax), %xmm4 + movaps 49(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + palignr $15, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $15, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $15, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl15LoopStart) + +L(Shl15LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 1(%eax), %xmm2 + movaps 17(%eax), %xmm3 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_15_no_prefetch): + lea -32(%ecx), %ecx + lea -15(%eax), %eax + xor %edi, %edi + + .p2align 4 +L(sh_15_no_prefetch_loop): + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 
32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm4 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm1, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jb L(sh_15_end_no_prefetch_loop) + + movdqa 16(%eax, %edi), %xmm2 + sub $32, %ecx + movdqa 32(%eax, %edi), %xmm3 + movdqa %xmm3, %xmm1 + palignr $15, %xmm2, %xmm3 + palignr $15, %xmm4, %xmm2 + lea 32(%edi), %edi + movdqa %xmm2, -32(%edx, %edi) + movdqa %xmm3, -16(%edx, %edi) + jae L(sh_15_no_prefetch_loop) + +L(sh_15_end_no_prefetch_loop): + lea 32(%ecx), %ecx + add %ecx, %edi + add %edi, %edx + lea 15(%edi, %eax), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(shl_end_0): + lea 32(%ecx), %ecx + lea (%edx, %ecx), %edx + lea (%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 +L(fwd_write_44bytes): + movq -44(%eax), %xmm0 + movq %xmm0, -44(%edx) +L(fwd_write_36bytes): + movq -36(%eax), %xmm0 + movq %xmm0, -36(%edx) +L(fwd_write_28bytes): + movq -28(%eax), %xmm0 + movq %xmm0, -28(%edx) +L(fwd_write_20bytes): + movq -20(%eax), %xmm0 + movq %xmm0, -20(%edx) +L(fwd_write_12bytes): + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) +L(fwd_write_4bytes): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_40bytes): + movq -40(%eax), %xmm0 + movq %xmm0, -40(%edx) +L(fwd_write_32bytes): + movq -32(%eax), %xmm0 + movq %xmm0, -32(%edx) +L(fwd_write_24bytes): + movq -24(%eax), %xmm0 + movq %xmm0, -24(%edx) +L(fwd_write_16bytes): + movq -16(%eax), %xmm0 + movq %xmm0, -16(%edx) +L(fwd_write_8bytes): + movq -8(%eax), %xmm0 + movq %xmm0, -8(%edx) +L(fwd_write_0bytes): +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_5bytes): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_45bytes): + movq -45(%eax), %xmm0 + movq %xmm0, -45(%edx) +L(fwd_write_37bytes): + movq -37(%eax), %xmm0 + movq %xmm0, -37(%edx) +L(fwd_write_29bytes): + movq -29(%eax), %xmm0 + movq %xmm0, -29(%edx) +L(fwd_write_21bytes): + movq -21(%eax), %xmm0 + movq %xmm0, -21(%edx) +L(fwd_write_13bytes): + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_41bytes): + movq -41(%eax), %xmm0 + movq %xmm0, -41(%edx) +L(fwd_write_33bytes): + movq -33(%eax), %xmm0 + movq %xmm0, -33(%edx) +L(fwd_write_25bytes): + movq -25(%eax), %xmm0 + movq %xmm0, -25(%edx) +L(fwd_write_17bytes): + movq -17(%eax), %xmm0 + movq %xmm0, -17(%edx) +L(fwd_write_9bytes): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) +L(fwd_write_1bytes): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_46bytes): + movq -46(%eax), %xmm0 + movq %xmm0, -46(%edx) +L(fwd_write_38bytes): + movq -38(%eax), %xmm0 + movq %xmm0, -38(%edx) +L(fwd_write_30bytes): + movq -30(%eax), %xmm0 + movq %xmm0, 
-30(%edx) +L(fwd_write_22bytes): + movq -22(%eax), %xmm0 + movq %xmm0, -22(%edx) +L(fwd_write_14bytes): + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) +L(fwd_write_6bytes): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_42bytes): + movq -42(%eax), %xmm0 + movq %xmm0, -42(%edx) +L(fwd_write_34bytes): + movq -34(%eax), %xmm0 + movq %xmm0, -34(%edx) +L(fwd_write_26bytes): + movq -26(%eax), %xmm0 + movq %xmm0, -26(%edx) +L(fwd_write_18bytes): + movq -18(%eax), %xmm0 + movq %xmm0, -18(%edx) +L(fwd_write_10bytes): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) +L(fwd_write_2bytes): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_47bytes): + movq -47(%eax), %xmm0 + movq %xmm0, -47(%edx) +L(fwd_write_39bytes): + movq -39(%eax), %xmm0 + movq %xmm0, -39(%edx) +L(fwd_write_31bytes): + movq -31(%eax), %xmm0 + movq %xmm0, -31(%edx) +L(fwd_write_23bytes): + movq -23(%eax), %xmm0 + movq %xmm0, -23(%edx) +L(fwd_write_15bytes): + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) +L(fwd_write_7bytes): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_43bytes): + movq -43(%eax), %xmm0 + movq %xmm0, -43(%edx) +L(fwd_write_35bytes): + movq -35(%eax), %xmm0 + movq %xmm0, -35(%edx) +L(fwd_write_27bytes): + movq -27(%eax), %xmm0 + movq %xmm0, -27(%edx) +L(fwd_write_19bytes): + movq -19(%eax), %xmm0 + movq %xmm0, -19(%edx) +L(fwd_write_11bytes): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) +L(fwd_write_3bytes): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_40bytes_align): + movdqa -40(%eax), %xmm0 + movdqa %xmm0, -40(%edx) +L(fwd_write_24bytes_align): + movdqa -24(%eax), %xmm0 + movdqa %xmm0, -24(%edx) +L(fwd_write_8bytes_align): + movq -8(%eax), %xmm0 + movq %xmm0, -8(%edx) +L(fwd_write_0bytes_align): +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_32bytes_align): + movdqa -32(%eax), %xmm0 + movdqa %xmm0, -32(%edx) +L(fwd_write_16bytes_align): + movdqa -16(%eax), %xmm0 + movdqa %xmm0, -16(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_5bytes_align): + movl -5(%eax), %ecx + movl -4(%eax), %eax + movl %ecx, -5(%edx) + movl %eax, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_45bytes_align): + movdqa -45(%eax), %xmm0 + movdqa %xmm0, -45(%edx) +L(fwd_write_29bytes_align): + movdqa -29(%eax), %xmm0 + movdqa %xmm0, -29(%edx) +L(fwd_write_13bytes_align): + movq -13(%eax), %xmm0 + movq %xmm0, -13(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + 
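+ /* Editorial note: mempcpy returns DEST + LEN; on this path %edx
+ already points just past the last byte stored, so it is the
+ return value.  */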
movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_37bytes_align): + movdqa -37(%eax), %xmm0 + movdqa %xmm0, -37(%edx) +L(fwd_write_21bytes_align): + movdqa -21(%eax), %xmm0 + movdqa %xmm0, -21(%edx) + movl -5(%eax), %ecx + movl %ecx, -5(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_41bytes_align): + movdqa -41(%eax), %xmm0 + movdqa %xmm0, -41(%edx) +L(fwd_write_25bytes_align): + movdqa -25(%eax), %xmm0 + movdqa %xmm0, -25(%edx) +L(fwd_write_9bytes_align): + movq -9(%eax), %xmm0 + movq %xmm0, -9(%edx) +L(fwd_write_1bytes_align): + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_33bytes_align): + movdqa -33(%eax), %xmm0 + movdqa %xmm0, -33(%edx) +L(fwd_write_17bytes_align): + movdqa -17(%eax), %xmm0 + movdqa %xmm0, -17(%edx) + movzbl -1(%eax), %ecx + movb %cl, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_46bytes_align): + movdqa -46(%eax), %xmm0 + movdqa %xmm0, -46(%edx) +L(fwd_write_30bytes_align): + movdqa -30(%eax), %xmm0 + movdqa %xmm0, -30(%edx) +L(fwd_write_14bytes_align): + movq -14(%eax), %xmm0 + movq %xmm0, -14(%edx) +L(fwd_write_6bytes_align): + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_38bytes_align): + movdqa -38(%eax), %xmm0 + movdqa %xmm0, -38(%edx) +L(fwd_write_22bytes_align): + movdqa -22(%eax), %xmm0 + movdqa %xmm0, -22(%edx) + movl -6(%eax), %ecx + movl %ecx, -6(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_42bytes_align): + movdqa -42(%eax), %xmm0 + movdqa %xmm0, -42(%edx) +L(fwd_write_26bytes_align): + movdqa -26(%eax), %xmm0 + movdqa %xmm0, -26(%edx) +L(fwd_write_10bytes_align): + movq -10(%eax), %xmm0 + movq %xmm0, -10(%edx) +L(fwd_write_2bytes_align): + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_34bytes_align): + movdqa -34(%eax), %xmm0 + movdqa %xmm0, -34(%edx) +L(fwd_write_18bytes_align): + movdqa -18(%eax), %xmm0 + movdqa %xmm0, -18(%edx) + movzwl -2(%eax), %ecx + movw %cx, -2(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_47bytes_align): + movdqa -47(%eax), %xmm0 + movdqa %xmm0, -47(%edx) +L(fwd_write_31bytes_align): + movdqa -31(%eax), %xmm0 + movdqa %xmm0, -31(%edx) +L(fwd_write_15bytes_align): + movq -15(%eax), %xmm0 + movq %xmm0, -15(%edx) +L(fwd_write_7bytes_align): + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_39bytes_align): + movdqa -39(%eax), %xmm0 + 
movdqa %xmm0, -39(%edx) +L(fwd_write_23bytes_align): + movdqa -23(%eax), %xmm0 + movdqa %xmm0, -23(%edx) + movl -7(%eax), %ecx + movl %ecx, -7(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_43bytes_align): + movdqa -43(%eax), %xmm0 + movdqa %xmm0, -43(%edx) +L(fwd_write_27bytes_align): + movdqa -27(%eax), %xmm0 + movdqa %xmm0, -27(%edx) +L(fwd_write_11bytes_align): + movq -11(%eax), %xmm0 + movq %xmm0, -11(%edx) +L(fwd_write_3bytes_align): + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_35bytes_align): + movdqa -35(%eax), %xmm0 + movdqa %xmm0, -35(%edx) +L(fwd_write_19bytes_align): + movdqa -19(%eax), %xmm0 + movdqa %xmm0, -19(%edx) + movzwl -3(%eax), %ecx + movzbl -1(%eax), %eax + movw %cx, -3(%edx) + movb %al, -1(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_44bytes_align): + movdqa -44(%eax), %xmm0 + movdqa %xmm0, -44(%edx) +L(fwd_write_28bytes_align): + movdqa -28(%eax), %xmm0 + movdqa %xmm0, -28(%edx) +L(fwd_write_12bytes_align): + movq -12(%eax), %xmm0 + movq %xmm0, -12(%edx) +L(fwd_write_4bytes_align): + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN + + .p2align 4 +L(fwd_write_36bytes_align): + movdqa -36(%eax), %xmm0 + movdqa %xmm0, -36(%edx) +L(fwd_write_20bytes_align): + movdqa -20(%eax), %xmm0 + movdqa %xmm0, -20(%edx) + movl -4(%eax), %ecx + movl %ecx, -4(%edx) +# ifndef USE_AS_BCOPY +# ifdef USE_AS_MEMPCPY + movl %edx, %eax +# else + movl DEST(%esp), %eax +# endif +# endif + RETURN_END + + CFI_PUSH (%edi) + + .p2align 4 +L(large_page): + movdqu (%eax), %xmm1 +# ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +# endif + lea 16(%eax), %eax + movntdq %xmm1, (%edx) + lea 16(%edx), %edx + lea -0x90(%ecx), %ecx + POP (%edi) + + .p2align 4 +L(large_page_loop): + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + movdqu 0x40(%eax), %xmm4 + movdqu 0x50(%eax), %xmm5 + movdqu 0x60(%eax), %xmm6 + movdqu 0x70(%eax), %xmm7 + lea 0x80(%eax), %eax + + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + movntdq %xmm4, 0x40(%edx) + movntdq %xmm5, 0x50(%edx) + movntdq %xmm6, 0x60(%edx) + movntdq %xmm7, 0x70(%edx) + lea 0x80(%edx), %edx + jae L(large_page_loop) + cmp $-0x40, %ecx + lea 0x80(%ecx), %ecx + jl L(large_page_less_64bytes) + + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + movdqu 0x20(%eax), %xmm2 + movdqu 0x30(%eax), %xmm3 + lea 0x40(%eax), %eax + + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + movntdq %xmm2, 0x20(%edx) + movntdq %xmm3, 0x30(%edx) + lea 0x40(%edx), %edx + sub $0x40, %ecx +L(large_page_less_64bytes): + cmp $32, %ecx + jb L(large_page_less_32bytes) + movdqu (%eax), %xmm0 + movdqu 0x10(%eax), %xmm1 + lea 0x20(%eax), %eax + movntdq %xmm0, (%edx) + movntdq %xmm1, 0x10(%edx) + lea 0x20(%edx), %edx + sub $0x20, %ecx +L(large_page_less_32bytes): + add %ecx, %edx + add %ecx, %eax + sfence + 
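+ /* Editorial note: the movntdq stores above are weakly ordered; the
+ sfence makes them globally visible before the tail is copied with
+ ordinary stores through the jump table below.  */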
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + + .p2align 4 +L(bk_write_44bytes): + movq 36(%eax), %xmm0 + movq %xmm0, 36(%edx) +L(bk_write_36bytes): + movq 28(%eax), %xmm0 + movq %xmm0, 28(%edx) +L(bk_write_28bytes): + movq 20(%eax), %xmm0 + movq %xmm0, 20(%edx) +L(bk_write_20bytes): + movq 12(%eax), %xmm0 + movq %xmm0, 12(%edx) +L(bk_write_12bytes): + movq 4(%eax), %xmm0 + movq %xmm0, 4(%edx) +L(bk_write_4bytes): + movl (%eax), %ecx + movl %ecx, (%edx) +L(bk_write_0bytes): +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_40bytes): + movq 32(%eax), %xmm0 + movq %xmm0, 32(%edx) +L(bk_write_32bytes): + movq 24(%eax), %xmm0 + movq %xmm0, 24(%edx) +L(bk_write_24bytes): + movq 16(%eax), %xmm0 + movq %xmm0, 16(%edx) +L(bk_write_16bytes): + movq 8(%eax), %xmm0 + movq %xmm0, 8(%edx) +L(bk_write_8bytes): + movq (%eax), %xmm0 + movq %xmm0, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_45bytes): + movq 37(%eax), %xmm0 + movq %xmm0, 37(%edx) +L(bk_write_37bytes): + movq 29(%eax), %xmm0 + movq %xmm0, 29(%edx) +L(bk_write_29bytes): + movq 21(%eax), %xmm0 + movq %xmm0, 21(%edx) +L(bk_write_21bytes): + movq 13(%eax), %xmm0 + movq %xmm0, 13(%edx) +L(bk_write_13bytes): + movq 5(%eax), %xmm0 + movq %xmm0, 5(%edx) +L(bk_write_5bytes): + movl 1(%eax), %ecx + movl %ecx, 1(%edx) +L(bk_write_1bytes): + movzbl (%eax), %ecx + movb %cl, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_41bytes): + movq 33(%eax), %xmm0 + movq %xmm0, 33(%edx) +L(bk_write_33bytes): + movq 25(%eax), %xmm0 + movq %xmm0, 25(%edx) +L(bk_write_25bytes): + movq 17(%eax), %xmm0 + movq %xmm0, 17(%edx) +L(bk_write_17bytes): + movq 9(%eax), %xmm0 + movq %xmm0, 9(%edx) +L(bk_write_9bytes): + movq 1(%eax), %xmm0 + movq %xmm0, 1(%edx) + movzbl (%eax), %ecx + movb %cl, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_46bytes): + movq 38(%eax), %xmm0 + movq %xmm0, 38(%edx) +L(bk_write_38bytes): + movq 30(%eax), %xmm0 + movq %xmm0, 30(%edx) +L(bk_write_30bytes): + movq 22(%eax), %xmm0 + movq %xmm0, 22(%edx) +L(bk_write_22bytes): + movq 14(%eax), %xmm0 + movq %xmm0, 14(%edx) +L(bk_write_14bytes): + movq 6(%eax), %xmm0 + movq %xmm0, 6(%edx) +L(bk_write_6bytes): + movl 2(%eax), %ecx + movl %ecx, 2(%edx) + movzwl (%eax), %ecx + movw %cx, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_42bytes): + movq 34(%eax), %xmm0 + movq %xmm0, 34(%edx) +L(bk_write_34bytes): + movq 26(%eax), %xmm0 + movq %xmm0, 26(%edx) +L(bk_write_26bytes): + movq 18(%eax), %xmm0 + movq %xmm0, 18(%edx) +L(bk_write_18bytes): + movq 10(%eax), %xmm0 + movq %xmm0, 10(%edx) +L(bk_write_10bytes): + movq 2(%eax), %xmm0 + movq %xmm0, 2(%edx) +L(bk_write_2bytes): + movzwl (%eax), %ecx + movw %cx, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_47bytes): + movq 39(%eax), %xmm0 + movq %xmm0, 39(%edx) +L(bk_write_39bytes): + movq 31(%eax), %xmm0 + movq %xmm0, 
31(%edx) +L(bk_write_31bytes): + movq 23(%eax), %xmm0 + movq %xmm0, 23(%edx) +L(bk_write_23bytes): + movq 15(%eax), %xmm0 + movq %xmm0, 15(%edx) +L(bk_write_15bytes): + movq 7(%eax), %xmm0 + movq %xmm0, 7(%edx) +L(bk_write_7bytes): + movl 3(%eax), %ecx + movl %ecx, 3(%edx) + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN + + .p2align 4 +L(bk_write_43bytes): + movq 35(%eax), %xmm0 + movq %xmm0, 35(%edx) +L(bk_write_35bytes): + movq 27(%eax), %xmm0 + movq %xmm0, 27(%edx) +L(bk_write_27bytes): + movq 19(%eax), %xmm0 + movq %xmm0, 19(%edx) +L(bk_write_19bytes): + movq 11(%eax), %xmm0 + movq %xmm0, 11(%edx) +L(bk_write_11bytes): + movq 3(%eax), %xmm0 + movq %xmm0, 3(%edx) +L(bk_write_3bytes): + movzwl 1(%eax), %ecx + movw %cx, 1(%edx) + movzbl (%eax), %eax + movb %al, (%edx) +# ifndef USE_AS_BCOPY + movl DEST(%esp), %eax +# ifdef USE_AS_MEMPCPY + movl LEN(%esp), %ecx + add %ecx, %eax +# endif +# endif + RETURN_END + + + .pushsection .rodata.ssse3,"a",@progbits + .p2align 2 +L(table_48bytes_fwd): + .int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd)) + .int JMPTBL 
(L(fwd_write_40bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd)) + .int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd)) + + .p2align 2 +L(table_48bytes_fwd_align): + .int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align)) + .int 
JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align)) + .int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align)) + + .p2align 2 +L(shl_table): + .int JMPTBL (L(shl_0), L(shl_table)) + .int JMPTBL (L(shl_1), L(shl_table)) + .int JMPTBL (L(shl_2), L(shl_table)) + .int JMPTBL (L(shl_3), L(shl_table)) + .int JMPTBL (L(shl_4), L(shl_table)) + .int JMPTBL (L(shl_5), L(shl_table)) + .int JMPTBL (L(shl_6), L(shl_table)) + .int JMPTBL (L(shl_7), L(shl_table)) + .int JMPTBL (L(shl_8), L(shl_table)) + .int JMPTBL (L(shl_9), L(shl_table)) + .int JMPTBL (L(shl_10), L(shl_table)) + .int JMPTBL (L(shl_11), L(shl_table)) + .int JMPTBL (L(shl_12), L(shl_table)) + .int JMPTBL (L(shl_13), L(shl_table)) + .int JMPTBL (L(shl_14), L(shl_table)) + .int JMPTBL (L(shl_15), L(shl_table)) + + .p2align 2 +L(table_48_bytes_bwd): + .int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_42bytes), 
L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd)) + .int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd)) + + .popsection + +# ifdef USE_AS_MEMMOVE + .p2align 4 +L(copy_backward): + PUSH (%edi) + movl %eax, %edi + lea (%ecx,%edx,1),%edx + lea (%ecx,%edi,1),%edi + testl $0x3, %edx + jnz L(bk_align) + +L(bk_aligned_4): + cmp $64, %ecx + jae L(bk_write_more64bytes) + +L(bk_write_64bytesless): + cmp $32, %ecx + jb L(bk_write_less32bytes) + +L(bk_write_more32bytes): + /* Copy 32 bytes at a time. */ + sub $32, %ecx + movq -8(%edi), %xmm0 + movq %xmm0, -8(%edx) + movq -16(%edi), %xmm0 + movq %xmm0, -16(%edx) + movq -24(%edi), %xmm0 + movq %xmm0, -24(%edx) + movq -32(%edi), %xmm0 + movq %xmm0, -32(%edx) + sub $32, %edx + sub $32, %edi + +L(bk_write_less32bytes): + movl %edi, %eax + sub %ecx, %edx + sub %ecx, %eax + POP (%edi) +L(bk_write_less32bytes_2): + BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(bk_align): + cmp $8, %ecx + jbe L(bk_write_less32bytes) + testl $1, %edx + /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, + then (EDX & 2) must be != 0. */ + jz L(bk_got2) + sub $1, %edi + sub $1, %ecx + sub $1, %edx + movzbl (%edi), %eax + movb %al, (%edx) + + testl $2, %edx + jz L(bk_aligned_4) + +L(bk_got2): + sub $2, %edi + sub $2, %ecx + sub $2, %edx + movzwl (%edi), %eax + movw %ax, (%edx) + jmp L(bk_aligned_4) + + .p2align 4 +L(bk_write_more64bytes): + /* Check alignment of last byte. */ + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + +/* EDX is aligned 4 bytes, but not 16 bytes. */ +L(bk_ssse3_align): + sub $4, %edi + sub $4, %ecx + sub $4, %edx + movl (%edi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %edi + sub $4, %ecx + sub $4, %edx + movl (%edi), %eax + movl %eax, (%edx) + + testl $15, %edx + jz L(bk_ssse3_cpy_pre) + + sub $4, %edi + sub $4, %ecx + sub $4, %edx + movl (%edi), %eax + movl %eax, (%edx) + +L(bk_ssse3_cpy_pre): + cmp $64, %ecx + jb L(bk_write_more32bytes) + + .p2align 4 +L(bk_ssse3_cpy): + sub $64, %edi + sub $64, %ecx + sub $64, %edx + movdqu 0x30(%edi), %xmm3 + movdqa %xmm3, 0x30(%edx) + movdqu 0x20(%edi), %xmm2 + movdqa %xmm2, 0x20(%edx) + movdqu 0x10(%edi), %xmm1 + movdqa %xmm1, 0x10(%edx) + movdqu (%edi), %xmm0 + movdqa %xmm0, (%edx) + cmp $64, %ecx + jae L(bk_ssse3_cpy) + jmp L(bk_write_64bytesless) + +# endif + +END (MEMCPY) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S new file mode 100644 index 0000000000..f725944620 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy.S @@ -0,0 +1,78 @@ +/* Multiple versions of memcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need memcpy before the initialization + happened. */ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(memcpy) + .type memcpy, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memcpy_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_ssse3_rep) +2: ret +END(memcpy) + +# undef ENTRY +# define ENTRY(name) \ + .type __memcpy_ia32, @function; \ + .p2align 4; \ + .globl __memcpy_ia32; \ + .hidden __memcpy_ia32; \ + __memcpy_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memcpy_ia32, .-__memcpy_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __memcpy_chk_ia32, @function; \ + .globl __memcpy_chk_ia32; \ + .p2align 4; \ + __memcpy_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __memcpy_chk_ia32, .-__memcpy_chk_ia32 + +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memcpy; __GI_memcpy = __memcpy_ia32 +#endif + +#include "../memcpy.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S new file mode 100644 index 0000000000..1b4fbe2e6f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memcpy_chk.S @@ -0,0 +1,50 @@ +/* Multiple versions of __memcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch memcpy functions for static binaries. 
+ */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__memcpy_chk) + .type __memcpy_chk, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_chk_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3_rep) +2: ret +END(__memcpy_chk) +# else +# include "../memcpy_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S new file mode 100644 index 0000000000..3873594cb2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_sse2_unaligned +#define MEMCPY_CHK __memmove_chk_sse2_unaligned +#include "memcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S new file mode 100644 index 0000000000..d202fc4a13 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3_rep +#define MEMCPY_CHK __memmove_chk_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S new file mode 100644 index 0000000000..295430b1ef --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY __memmove_ssse3 +#define MEMCPY_CHK __memmove_chk_ssse3 +#include "memcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S new file mode 100644 index 0000000000..6eb418ca7f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove.S @@ -0,0 +1,89 @@ +/* Multiple versions of memmove + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. 
*/ +#if IS_IN (libc) + .text +ENTRY(memmove) + .type memmove, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memmove_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_ssse3_rep) +2: ret +END(memmove) + +# ifdef SHARED +# undef ENTRY +# define ENTRY(name) \ + .type __memmove_ia32, @function; \ + .p2align 4; \ + .globl __memmove_ia32; \ + .hidden __memmove_ia32; \ + __memmove_ia32: cfi_startproc; \ + CALL_MCOUNT +# else +# undef ENTRY +# define ENTRY(name) \ + .type __memmove_ia32, @function; \ + .globl __memmove_ia32; \ + .p2align 4; \ + __memmove_ia32: cfi_startproc; \ + CALL_MCOUNT +# endif + +# undef END +# define END(name) \ + cfi_endproc; .size __memmove_ia32, .-__memmove_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __memmove_chk_ia32, @function; \ + .globl __memmove_chk_ia32; \ + .p2align 4; \ + __memmove_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __memmove_chk_ia32, .-__memmove_chk_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memmove; __GI_memmove = __memmove_ia32 +# endif +#endif + +#include "../memmove.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S new file mode 100644 index 0000000000..314834c4c6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memmove_chk.S @@ -0,0 +1,94 @@ +/* Multiple versions of __memmove_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. 
*/ +#if IS_IN (libc) + .text +ENTRY(__memmove_chk) + .type __memmove_chk, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memmove_chk_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_chk_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3_rep) +2: ret +END(__memmove_chk) + +# ifndef SHARED + .type __memmove_chk_sse2_unaligned, @function + .p2align 4; +__memmove_chk_sse2_unaligned: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_sse2_unaligned + cfi_endproc + .size __memmove_chk_sse2_unaligned, .-__memmove_chk_sse2_unaligned + + .type __memmove_chk_ssse3, @function + .p2align 4; +__memmove_chk_ssse3: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_ssse3 + cfi_endproc + .size __memmove_chk_ssse3, .-__memmove_chk_ssse3 + + .type __memmove_chk_ssse3_rep, @function + .p2align 4; +__memmove_chk_ssse3_rep: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_ssse3_rep + cfi_endproc + .size __memmove_chk_ssse3_rep, .-__memmove_chk_ssse3_rep + + .type __memmove_chk_ia32, @function + .p2align 4; +__memmove_chk_ia32: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memmove_ia32 + cfi_endproc + .size __memmove_chk_ia32, .-__memmove_chk_ia32 +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S new file mode 100644 index 0000000000..a1cea50771 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-sse2-unaligned.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_sse2_unaligned +#define MEMCPY_CHK __mempcpy_chk_sse2_unaligned +#include "memcpy-sse2-unaligned.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S new file mode 100644 index 0000000000..5357b33e18 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3-rep.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_ssse3_rep +#define MEMCPY_CHK __mempcpy_chk_ssse3_rep +#include "memcpy-ssse3-rep.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S new file mode 100644 index 0000000000..822d98e954 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy_ssse3 +#define MEMCPY_CHK __mempcpy_chk_ssse3 +#include "memcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S new file mode 100644 index 0000000000..06e377fbc9 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy.S @@ -0,0 +1,81 @@ +/* Multiple versions of mempcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. In static binaries we need mempcpy before the initialization + happened. */ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(__mempcpy) + .type __mempcpy, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__mempcpy_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_ssse3_rep) +2: ret +END(__mempcpy) + +# undef ENTRY +# define ENTRY(name) \ + .type __mempcpy_ia32, @function; \ + .p2align 4; \ + .globl __mempcpy_ia32; \ + .hidden __mempcpy_ia32; \ + __mempcpy_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __mempcpy_ia32, .-__mempcpy_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __mempcpy_chk_ia32, @function; \ + .globl __mempcpy_chk_ia32; \ + .p2align 4; \ + __mempcpy_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __mempcpy_chk_ia32, .-__mempcpy_chk_ia32 + +# undef libc_hidden_def +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_def(name) \ + .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_ia32 +# define libc_hidden_builtin_def(name) \ + .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_ia32 +#endif + +#include "../mempcpy.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S new file mode 100644 index 0000000000..e13e5248a5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/mempcpy_chk.S @@ -0,0 +1,50 @@ +/* Multiple versions of __mempcpy_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib and for + DSO. There are no multiarch mempcpy functions for static binaries. + */ +#if IS_IN (libc) +# ifdef SHARED + .text +ENTRY(__mempcpy_chk) + .type __mempcpy_chk, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_chk_sse2_unaligned) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3_rep) +2: ret +END(__mempcpy_chk) +# else +# include "../mempcpy_chk.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c new file mode 100644 index 0000000000..ef7bbbe792 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-c.c @@ -0,0 +1,7 @@ +#if IS_IN (libc) +# define MEMRCHR __memrchr_ia32 +# include <string.h> +extern void *__memrchr_ia32 (const void *, int, size_t); +#endif + +#include "string/memrchr.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S new file mode 100644 index 0000000000..dbbe94fd08 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S @@ -0,0 +1,417 @@ +/* Optimized memrchr with sse2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +# define MEMCHR __memrchr_sse2_bsf + + .text +ENTRY (MEMCHR) + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + mov LEN(%esp), %edx + + sub $16, %edx + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + add %edx, %ecx + punpcklbw %xmm1, %xmm1 + + movdqu (%ecx), %xmm0 + pshufd $0, %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + +/* Check if there is a match. */ + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %ecx + mov %ecx, %eax + and $15, %eax + jz L(loop_prolog) + + add $16, %ecx + add $16, %edx + sub %eax, %ecx + sub %eax, %edx + + .p2align 4 +/* Loop start on aligned string. 
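For orientation, everything the SSE2 memrchr variants in this directory compute is the scalar backwards scan sketched below; the vector code merely examines 16 or 64 bytes per step:

#include <stddef.h>

/* Scalar reference: pointer to the last occurrence of c in the first
   n bytes of s, or NULL if absent.  */
static void *
memrchr_ref (const void *s, int c, size_t n)
{
  const unsigned char *p = (const unsigned char *) s + n;
  while (n-- > 0)
    if (*--p == (unsigned char) c)
      return (void *) p;
  return NULL;
}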
*/ +L(loop_prolog): + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm4 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches0) + + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches0) + + mov %ecx, %eax + and $63, %eax + test %eax, %eax + jz L(align64_loop) + + add $64, %ecx + add $64, %edx + sub %eax, %ecx + sub %eax, %edx + + .p2align 4 +L(align64_loop): + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa (%ecx), %xmm0 + movdqa 16(%ecx), %xmm2 + movdqa 32(%ecx), %xmm3 + movdqa 48(%ecx), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm3, %xmm0 + pmaxub %xmm4, %xmm2 + pmaxub %xmm0, %xmm2 + pmovmskb %xmm2, %eax + + test %eax, %eax + jz L(align64_loop) + + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches48) + + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm2 + + pcmpeqb %xmm1, %xmm2 + pcmpeqb (%ecx), %xmm1 + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + pmovmskb %xmm1, %eax + bsr %eax, %eax + + add %ecx, %eax + ret + + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb (%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches0_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 32(%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches32_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches0): + bsr %eax, %eax + add %ecx, %eax + ret + + .p2align 4 +L(matches16): + bsr %eax, %eax + lea 16(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches32): + bsr %eax, %eax + lea 32(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches48): + bsr %eax, %eax + lea 48(%eax, %ecx), %eax + ret + + .p2align 4 +L(matches0_1): + bsr %eax, %eax + sub $64, %edx + add %eax, %edx + jl L(return_null) + add %ecx, %eax + ret + + .p2align 4 +L(matches16_1): + bsr %eax, %eax + sub $48, %edx + add %eax, %edx + jl L(return_null) + lea 16(%ecx, %eax), %eax + ret + + .p2align 4 +L(matches32_1): + bsr %eax, %eax + sub $32, %edx + add %eax, %edx + jl L(return_null) + lea 32(%ecx, %eax), %eax + ret + + .p2align 4 +L(matches48_1): + bsr %eax, %eax + sub $16, %edx + add %eax, %edx + jl L(return_null) + lea 48(%ecx, %eax), %eax + 
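Throughout this file each 16-byte block is compared with pcmpeqb and compressed by pmovmskb into a 16-bit mask in which bit i is set exactly when byte i matched; bsr then selects the most significant set bit, i.e. the last match in the block. The same step in C, with a GCC builtin standing in for bsr:

/* Index of the last matching byte in a block, given a nonzero
   pcmpeqb/pmovmskb mask; 31 - clz equals bsr for nonzero input.  */
static int
last_match_index (unsigned int mask)
{
  return 31 - __builtin_clz (mask);
}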
ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + + .p2align 4 +L(length_less16_offset0): + mov %dl, %cl + pcmpeqb (%eax), %xmm1 + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + mov %edx, %ecx + + pmovmskb %xmm1, %edx + + and %ecx, %edx + test %edx, %edx + jz L(return_null) + + bsr %edx, %ecx + add %ecx, %eax + ret + + .p2align 4 +L(length_less16): + punpcklbw %xmm1, %xmm1 + mov %ecx, %eax + punpcklbw %xmm1, %xmm1 + add $16, %edx + jz L(return_null) + + pshufd $0, %xmm1, %xmm1 + and $15, %ecx + jz L(length_less16_offset0) + + PUSH (%edi) + mov %cl, %dh + add %dl, %dh + and $-16, %eax + + sub $16, %dh + ja L(length_less16_part2) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + sar %cl, %edi + add %ecx, %eax + mov %dl, %cl + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2): + movdqa 16(%eax), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %edi + + mov %cl, %ch + + mov %dh, %cl + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + + test %edi, %edi + jnz L(length_less16_part2_return) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + mov %ch, %cl + sar %cl, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + xor %ch, %ch + add %ecx, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2_return): + bsr %edi, %edi + lea 16(%eax, %edi), %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(ret_null): + xor %eax, %eax + POP (%edi) + ret + +END (MEMCHR) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S new file mode 100644 index 0000000000..5f7853f683 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr-sse2.S @@ -0,0 +1,724 @@ +/* Optimized memrchr with sse2 without bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +# include <sysdep.h> +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + + atom_text_section +ENTRY (__memrchr_sse2) + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + mov LEN(%esp), %edx + + sub $16, %edx + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + add %edx, %ecx + punpcklbw %xmm1, %xmm1 + + movdqu (%ecx), %xmm0 + pshufd $0, %xmm1, %xmm1 + pcmpeqb %xmm1, %xmm0 + + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(exit_dispatch) + + sub $64, %ecx + mov %ecx, %eax + and $15, %eax + jz L(loop_prolog) + + lea 16(%ecx), %ecx + lea 16(%edx), %edx + sub %eax, %edx + and $-16, %ecx + + .p2align 4 +/* Loop start on aligned string. */ +L(loop_prolog): + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm4 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(exit_dispatch) + + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16) + + movdqa (%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(exit_dispatch) + + mov %ecx, %eax + and $63, %eax + test %eax, %eax + jz L(align64_loop) + + lea 64(%ecx), %ecx + lea 64(%edx), %edx + and $-64, %ecx + sub %eax, %edx + + .p2align 4 +L(align64_loop): + sub $64, %ecx + sub $64, %edx + jbe L(exit_loop) + + movdqa (%ecx), %xmm0 + movdqa 16(%ecx), %xmm2 + movdqa 32(%ecx), %xmm3 + movdqa 48(%ecx), %xmm4 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm3, %xmm0 + pmaxub %xmm4, %xmm2 + pmaxub %xmm0, %xmm2 + pmovmskb %xmm2, %eax + + test %eax, %eax + jz L(align64_loop) + + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(matches48) + + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm2 + + pcmpeqb %xmm1, %xmm2 + pcmpeqb (%ecx), %xmm1 + + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches16) + + pmovmskb %xmm1, %eax + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(exit_loop): + add $64, %edx + cmp $32, %edx + jbe L(exit_loop_32) + + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48) + + movdqa 32(%ecx), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(matches32) + + movdqa 16(%ecx), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches16_1) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb (%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 
L(matches0_1) + xor %eax, %eax + ret + + .p2align 4 +L(exit_loop_32): + movdqa 48(%ecx), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches48_1) + cmp $16, %edx + jbe L(return_null) + + pcmpeqb 32(%ecx), %xmm1 + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(matches32_1) + xor %eax, %eax + ret + + .p2align 4 +L(matches16): + lea 16(%ecx), %ecx + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(matches32): + lea 32(%ecx), %ecx + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(matches48): + lea 48(%ecx), %ecx + + .p2align 4 +L(exit_dispatch): + test %ah, %ah + jnz L(exit_dispatch_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(exit_dispatch_8) + test $0x08, %al + jnz L(exit_4) + test $0x04, %al + jnz L(exit_3) + test $0x02, %al + jnz L(exit_2) + mov %ecx, %eax + ret + + .p2align 4 +L(exit_dispatch_8): + test $0x80, %al + jnz L(exit_8) + test $0x40, %al + jnz L(exit_7) + test $0x20, %al + jnz L(exit_6) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_high): + mov %ah, %dh + and $15 << 4, %dh + jnz L(exit_dispatch_high_8) + test $0x08, %ah + jnz L(exit_12) + test $0x04, %ah + jnz L(exit_11) + test $0x02, %ah + jnz L(exit_10) + lea 8(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_high_8): + test $0x80, %ah + jnz L(exit_16) + test $0x40, %ah + jnz L(exit_15) + test $0x20, %ah + jnz L(exit_14) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(exit_2): + lea 1(%ecx), %eax + ret + + .p2align 4 +L(exit_3): + lea 2(%ecx), %eax + ret + + .p2align 4 +L(exit_4): + lea 3(%ecx), %eax + ret + + .p2align 4 +L(exit_6): + lea 5(%ecx), %eax + ret + + .p2align 4 +L(exit_7): + lea 6(%ecx), %eax + ret + + .p2align 4 +L(exit_8): + lea 7(%ecx), %eax + ret + + .p2align 4 +L(exit_10): + lea 9(%ecx), %eax + ret + + .p2align 4 +L(exit_11): + lea 10(%ecx), %eax + ret + + .p2align 4 +L(exit_12): + lea 11(%ecx), %eax + ret + + .p2align 4 +L(exit_14): + lea 13(%ecx), %eax + ret + + .p2align 4 +L(exit_15): + lea 14(%ecx), %eax + ret + + .p2align 4 +L(exit_16): + lea 15(%ecx), %eax + ret + + .p2align 4 +L(matches0_1): + lea -64(%edx), %edx + + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(matches16_1): + lea -48(%edx), %edx + lea 16(%ecx), %ecx + + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(matches32_1): + lea -32(%edx), %edx + lea 32(%ecx), %ecx + + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(matches48_1): + lea -16(%edx), %edx + lea 48(%ecx), %ecx + + .p2align 4 
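This variant is selected when the CPU carries the Slow_BSF attribute (see the memrchr.S selector below), so on its hot paths it replaces bsr with the exit_dispatch ladders above: test the high byte, then the high nibble, then individual bits. A C sketch of that branchy most-significant-bit search:

/* Branch-ladder equivalent of exit_dispatch: highest set bit of a
   nonzero 16-bit mask, computed without bsr.  */
static int
last_match_no_bsf (unsigned int mask)
{
  int base = 0;
  if (mask & 0xff00) { base = 8;  mask >>= 8; }  /* test %ah, %ah */
  if (mask & 0x00f0) { base += 4; mask >>= 4; }  /* and $15 << 4  */
  if (mask & 0x8) return base + 3;
  if (mask & 0x4) return base + 2;
  if (mask & 0x2) return base + 1;
  return base;
}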
+L(exit_dispatch_1): + test %ah, %ah + jnz L(exit_dispatch_1_high) + mov %al, %ah + and $15 << 4, %ah + jnz L(exit_dispatch_1_8) + test $0x08, %al + jnz L(exit_1_4) + test $0x04, %al + jnz L(exit_1_3) + test $0x02, %al + jnz L(exit_1_2) + add $0, %edx + jl L(return_null) + mov %ecx, %eax + ret + + .p2align 4 +L(exit_dispatch_1_8): + test $0x80, %al + jnz L(exit_1_8) + test $0x40, %al + jnz L(exit_1_7) + test $0x20, %al + jnz L(exit_1_6) + add $4, %edx + jl L(return_null) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_1_high): + mov %ah, %al + and $15 << 4, %al + jnz L(exit_dispatch_1_high_8) + test $0x08, %ah + jnz L(exit_1_12) + test $0x04, %ah + jnz L(exit_1_11) + test $0x02, %ah + jnz L(exit_1_10) + add $8, %edx + jl L(return_null) + lea 8(%ecx), %eax + ret + + .p2align 4 +L(exit_dispatch_1_high_8): + test $0x80, %ah + jnz L(exit_1_16) + test $0x40, %ah + jnz L(exit_1_15) + test $0x20, %ah + jnz L(exit_1_14) + add $12, %edx + jl L(return_null) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(exit_1_2): + add $1, %edx + jl L(return_null) + lea 1(%ecx), %eax + ret + + .p2align 4 +L(exit_1_3): + add $2, %edx + jl L(return_null) + lea 2(%ecx), %eax + ret + + .p2align 4 +L(exit_1_4): + add $3, %edx + jl L(return_null) + lea 3(%ecx), %eax + ret + + .p2align 4 +L(exit_1_6): + add $5, %edx + jl L(return_null) + lea 5(%ecx), %eax + ret + + .p2align 4 +L(exit_1_7): + add $6, %edx + jl L(return_null) + lea 6(%ecx), %eax + ret + + .p2align 4 +L(exit_1_8): + add $7, %edx + jl L(return_null) + lea 7(%ecx), %eax + ret + + .p2align 4 +L(exit_1_10): + add $9, %edx + jl L(return_null) + lea 9(%ecx), %eax + ret + + .p2align 4 +L(exit_1_11): + add $10, %edx + jl L(return_null) + lea 10(%ecx), %eax + ret + + .p2align 4 +L(exit_1_12): + add $11, %edx + jl L(return_null) + lea 11(%ecx), %eax + ret + + .p2align 4 +L(exit_1_14): + add $13, %edx + jl L(return_null) + lea 13(%ecx), %eax + ret + + .p2align 4 +L(exit_1_15): + add $14, %edx + jl L(return_null) + lea 14(%ecx), %eax + ret + + .p2align 4 +L(exit_1_16): + add $15, %edx + jl L(return_null) + lea 15(%ecx), %eax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + + .p2align 4 +L(length_less16_offset0): + mov %dl, %cl + pcmpeqb (%eax), %xmm1 + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + mov %eax, %ecx + pmovmskb %xmm1, %eax + + and %edx, %eax + test %eax, %eax + jnz L(exit_dispatch) + + xor %eax, %eax + ret + + .p2align 4 +L(length_less16): + punpcklbw %xmm1, %xmm1 + add $16, %edx + je L(return_null) + punpcklbw %xmm1, %xmm1 + + mov %ecx, %eax + pshufd $0, %xmm1, %xmm1 + + and $15, %ecx + jz L(length_less16_offset0) + + PUSH (%edi) + + mov %cl, %dh + add %dl, %dh + and $-16, %eax + + sub $16, %dh + ja L(length_less16_part2) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + sar %cl, %edi + add %ecx, %eax + mov %dl, %cl + + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2): + movdqa 16(%eax), %xmm2 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm2, %edi + + mov %cl, %ch + + mov %dh, %cl + mov $1, %edx + sal %cl, %edx + sub $1, %edx + + and %edx, %edi + + test %edi, %edi + jnz L(length_less16_part2_return) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edi + + mov %ch, %cl + sar %cl, %edi + test %edi, %edi + jz L(ret_null) + + bsr %edi, %edi + add %edi, %eax + xor %ch, %ch + add %ecx, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(length_less16_part2_return): + bsr %edi, 
%edi + lea 16(%eax, %edi), %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(ret_null): + xor %eax, %eax + POP (%edi) + ret + +END (__memrchr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S new file mode 100644 index 0000000000..d4253a553b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memrchr.S @@ -0,0 +1,45 @@ +/* Multiple versions of memrchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__memrchr) + .type __memrchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + HAS_CPU_FEATURE (SSE2) + jz 2f + HAS_ARCH_FEATURE (Slow_BSF) + jz 3f + + LOAD_FUNC_GOT_EAX (__memrchr_sse2) + ret + +2: LOAD_FUNC_GOT_EAX (__memrchr_ia32) + ret + +3: LOAD_FUNC_GOT_EAX (__memrchr_sse2_bsf) + ret +END(__memrchr) + +weak_alias(__memrchr, memrchr) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S new file mode 100644 index 0000000000..3221077e49 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2-rep.S @@ -0,0 +1,811 @@ +/* memset with SSE2 and REP string. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
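The __memrchr selector above differs from the mempcpy-style cascade: SSE2 is mandatory for either optimized version, and the bsf flavour is chosen only when the CPU lacks the Slow_BSF attribute. As a C model (the flag parameters are stand-ins, as before):

#include <stddef.h>

typedef void *(*memrchr_fn) (const void *, int, size_t);

extern void *__memrchr_ia32 (const void *, int, size_t);
extern void *__memrchr_sse2 (const void *, int, size_t);
extern void *__memrchr_sse2_bsf (const void *, int, size_t);

/* Decision order of the __memrchr ifunc selector.  */
static memrchr_fn
select_memrchr (int has_sse2, int slow_bsf)
{
  if (!has_sse2)
    return __memrchr_ia32;
  return slow_bsf ? __memrchr_sse2 : __memrchr_sse2_bsf;
}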
*/ + +#if IS_IN (libc) + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_BZERO +# define DEST PARMS +# define LEN DEST+4 +# define SETRTNVAL +#else +# define DEST PARMS +# define CHR DEST+4 +# define LEN CHR+4 +# define SETRTNVAL movl DEST(%esp), %eax +#endif + +#ifdef SHARED +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define PARMS 8 /* Preserve EBX. */ +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + add $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + add (%ebx,%ecx,4), %ebx; \ + add %ecx, %edx; \ + /* We loaded the jump table and adjusted EDX. Go. */ \ + jmp *%ebx +#else +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define PARMS 4 +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + add %ecx, %edx; \ + jmp *TABLE(,%ecx,4) +#endif + + .section .text.sse2,"ax",@progbits +#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO +ENTRY (__memset_chk_sse2_rep) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk_sse2_rep) +#endif +ENTRY (__memset_sse2_rep) + ENTRANCE + + movl LEN(%esp), %ecx +#ifdef USE_AS_BZERO + xor %eax, %eax +#else + movzbl CHR(%esp), %eax + movb %al, %ah + /* Fill the whole EAX with pattern. 
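In the SHARED build, BRANCH_TO_JMPTBL_ENTRY above stores label differences rather than absolute addresses, so the .rodata jump tables are position-independent and need no dynamic relocations. GNU C's labels-as-values extension expresses the same construction; a sketch with illustrative labels:

/* Position-independent jump table in GNU C: entries are offsets from
   a base label, mirroring JMPTBL(I, B) = I - B, and the base address
   is added back at dispatch time.  */
static int
dispatch (int k)                  /* k must be 0, 1 or 2 */
{
  static const int table[] = { &&case0 - &&case0,
                               &&case1 - &&case0,
                               &&case2 - &&case0 };
  goto *(&&case0 + table[k]);
 case0: return 0;
 case1: return 1;
 case2: return 2;
}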
*/ + movl %eax, %edx + shl $16, %eax + or %edx, %eax +#endif + movl DEST(%esp), %edx + cmp $32, %ecx + jae L(32bytesormore) + +L(write_less32bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes)) + + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_less_32bytes): + .int JMPTBL (L(write_0bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_7bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_32bytes)) + .popsection + + ALIGN (4) +L(write_28bytes): + movl %eax, -28(%edx) +L(write_24bytes): + movl %eax, -24(%edx) +L(write_20bytes): + movl %eax, -20(%edx) +L(write_16bytes): + movl %eax, -16(%edx) +L(write_12bytes): + movl %eax, -12(%edx) +L(write_8bytes): + movl %eax, -8(%edx) +L(write_4bytes): + movl %eax, -4(%edx) +L(write_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(write_29bytes): + movl %eax, -29(%edx) +L(write_25bytes): + movl %eax, -25(%edx) +L(write_21bytes): + movl %eax, -21(%edx) +L(write_17bytes): + movl %eax, -17(%edx) +L(write_13bytes): + movl %eax, -13(%edx) +L(write_9bytes): + movl %eax, -9(%edx) +L(write_5bytes): + movl %eax, -5(%edx) +L(write_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_30bytes): + movl %eax, -30(%edx) +L(write_26bytes): + movl %eax, -26(%edx) +L(write_22bytes): + movl %eax, -22(%edx) +L(write_18bytes): + movl %eax, -18(%edx) +L(write_14bytes): + movl %eax, -14(%edx) +L(write_10bytes): + movl %eax, -10(%edx) +L(write_6bytes): + movl %eax, -6(%edx) +L(write_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_31bytes): + movl %eax, -31(%edx) +L(write_27bytes): + movl %eax, -27(%edx) +L(write_23bytes): + movl %eax, -23(%edx) +L(write_19bytes): + movl %eax, -19(%edx) +L(write_15bytes): + movl %eax, -15(%edx) +L(write_11bytes): + movl %eax, -11(%edx) +L(write_7bytes): + movl %eax, -7(%edx) +L(write_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +/* 
ECX > 32 and EDX is 4 byte aligned. */ +L(32bytesormore): + /* Fill xmm0 with the pattern. */ +#ifdef USE_AS_BZERO + pxor %xmm0, %xmm0 +#else + movd %eax, %xmm0 + pshufd $0, %xmm0, %xmm0 +#endif + testl $0xf, %edx + jz L(aligned_16) +/* ECX > 32 and EDX is not 16 byte aligned. */ +L(not_aligned_16): + movdqu %xmm0, (%edx) + movl %edx, %eax + and $-16, %edx + add $16, %edx + sub %edx, %eax + add %eax, %ecx + movd %xmm0, %eax + + ALIGN (4) +L(aligned_16): + cmp $128, %ecx + jae L(128bytesormore) + +L(aligned_16_less128bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytesormore): + PUSH (%edi) +#ifdef DATA_CACHE_SIZE + PUSH (%ebx) + mov $DATA_CACHE_SIZE, %ebx +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov __x86_data_cache_size@GOTOFF(%ebx), %ebx +# else + PUSH (%ebx) + mov __x86_data_cache_size, %ebx +# endif +#endif + mov %ebx, %edi + shr $4, %ebx + sub %ebx, %edi +#if defined DATA_CACHE_SIZE || !defined SHARED + POP (%ebx) +#endif +/* + * When data size approximate the end of L1 cache, + * fast string will prefetch and combine data efficiently. + */ + cmp %edi, %ecx + jae L(128bytesormore_endof_L1) + subl $128, %ecx +L(128bytesormore_normal): + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jb L(128bytesless_normal) + + + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jae L(128bytesormore_normal) + +L(128bytesless_normal): + POP (%edi) + add $128, %ecx + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + CFI_PUSH (%edi) + ALIGN (4) +L(128bytesormore_endof_L1): + mov %edx, %edi + mov %ecx, %edx + shr $2, %ecx + and $3, %edx + rep stosl + jz L(copy_page_by_rep_exit) + cmp $2, %edx + jb L(copy_page_by_rep_left_1) + movw %ax, (%edi) + add $2, %edi + sub $2, %edx + jz L(copy_page_by_rep_exit) +L(copy_page_by_rep_left_1): + movb %al, (%edi) +L(copy_page_by_rep_exit): + POP (%edi) + SETRTNVAL + RETURN + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_16_128bytes): + .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_19bytes), 
L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_58bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_78bytes), 
L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes)) + .popsection + + ALIGN (4) +L(aligned_16_112bytes): + movdqa %xmm0, -112(%edx) +L(aligned_16_96bytes): + movdqa %xmm0, -96(%edx) +L(aligned_16_80bytes): + movdqa %xmm0, -80(%edx) +L(aligned_16_64bytes): + movdqa %xmm0, -64(%edx) +L(aligned_16_48bytes): + movdqa %xmm0, -48(%edx) +L(aligned_16_32bytes): + movdqa %xmm0, -32(%edx) +L(aligned_16_16bytes): + movdqa %xmm0, -16(%edx) +L(aligned_16_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_113bytes): + movdqa %xmm0, -113(%edx) +L(aligned_16_97bytes): + movdqa %xmm0, -97(%edx) +L(aligned_16_81bytes): 
+ movdqa %xmm0, -81(%edx) +L(aligned_16_65bytes): + movdqa %xmm0, -65(%edx) +L(aligned_16_49bytes): + movdqa %xmm0, -49(%edx) +L(aligned_16_33bytes): + movdqa %xmm0, -33(%edx) +L(aligned_16_17bytes): + movdqa %xmm0, -17(%edx) +L(aligned_16_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_114bytes): + movdqa %xmm0, -114(%edx) +L(aligned_16_98bytes): + movdqa %xmm0, -98(%edx) +L(aligned_16_82bytes): + movdqa %xmm0, -82(%edx) +L(aligned_16_66bytes): + movdqa %xmm0, -66(%edx) +L(aligned_16_50bytes): + movdqa %xmm0, -50(%edx) +L(aligned_16_34bytes): + movdqa %xmm0, -34(%edx) +L(aligned_16_18bytes): + movdqa %xmm0, -18(%edx) +L(aligned_16_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_115bytes): + movdqa %xmm0, -115(%edx) +L(aligned_16_99bytes): + movdqa %xmm0, -99(%edx) +L(aligned_16_83bytes): + movdqa %xmm0, -83(%edx) +L(aligned_16_67bytes): + movdqa %xmm0, -67(%edx) +L(aligned_16_51bytes): + movdqa %xmm0, -51(%edx) +L(aligned_16_35bytes): + movdqa %xmm0, -35(%edx) +L(aligned_16_19bytes): + movdqa %xmm0, -19(%edx) +L(aligned_16_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_116bytes): + movdqa %xmm0, -116(%edx) +L(aligned_16_100bytes): + movdqa %xmm0, -100(%edx) +L(aligned_16_84bytes): + movdqa %xmm0, -84(%edx) +L(aligned_16_68bytes): + movdqa %xmm0, -68(%edx) +L(aligned_16_52bytes): + movdqa %xmm0, -52(%edx) +L(aligned_16_36bytes): + movdqa %xmm0, -36(%edx) +L(aligned_16_20bytes): + movdqa %xmm0, -20(%edx) +L(aligned_16_4bytes): + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_117bytes): + movdqa %xmm0, -117(%edx) +L(aligned_16_101bytes): + movdqa %xmm0, -101(%edx) +L(aligned_16_85bytes): + movdqa %xmm0, -85(%edx) +L(aligned_16_69bytes): + movdqa %xmm0, -69(%edx) +L(aligned_16_53bytes): + movdqa %xmm0, -53(%edx) +L(aligned_16_37bytes): + movdqa %xmm0, -37(%edx) +L(aligned_16_21bytes): + movdqa %xmm0, -21(%edx) +L(aligned_16_5bytes): + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_118bytes): + movdqa %xmm0, -118(%edx) +L(aligned_16_102bytes): + movdqa %xmm0, -102(%edx) +L(aligned_16_86bytes): + movdqa %xmm0, -86(%edx) +L(aligned_16_70bytes): + movdqa %xmm0, -70(%edx) +L(aligned_16_54bytes): + movdqa %xmm0, -54(%edx) +L(aligned_16_38bytes): + movdqa %xmm0, -38(%edx) +L(aligned_16_22bytes): + movdqa %xmm0, -22(%edx) +L(aligned_16_6bytes): + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_119bytes): + movdqa %xmm0, -119(%edx) +L(aligned_16_103bytes): + movdqa %xmm0, -103(%edx) +L(aligned_16_87bytes): + movdqa %xmm0, -87(%edx) +L(aligned_16_71bytes): + movdqa %xmm0, -71(%edx) +L(aligned_16_55bytes): + movdqa %xmm0, -55(%edx) +L(aligned_16_39bytes): + movdqa %xmm0, -39(%edx) +L(aligned_16_23bytes): + movdqa %xmm0, -23(%edx) +L(aligned_16_7bytes): + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_120bytes): + movdqa %xmm0, -120(%edx) +L(aligned_16_104bytes): + movdqa %xmm0, -104(%edx) +L(aligned_16_88bytes): + movdqa %xmm0, -88(%edx) +L(aligned_16_72bytes): + movdqa %xmm0, -72(%edx) +L(aligned_16_56bytes): + movdqa %xmm0, -56(%edx) +L(aligned_16_40bytes): + movdqa %xmm0, -40(%edx) +L(aligned_16_24bytes): + movdqa %xmm0, -24(%edx) +L(aligned_16_8bytes): + movq %xmm0, -8(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_121bytes): + movdqa %xmm0, -121(%edx) +L(aligned_16_105bytes): + movdqa %xmm0, -105(%edx) 
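These label ladders write the last n bytes backwards from EDX, which BRANCH_TO_JMPTBL_ENTRY has already advanced one past the end, and deliberately fall through shared tails so every residue count needs only a few fixed stores. A C switch sketch of the dword-step ladder (memcpy stands in for the unaligned movl stores, which strict C cannot express directly):

#include <stdint.h>
#include <string.h>

/* Sketch of one fall-through ladder: store the final n bytes before
   end, n a multiple of 4 and at most 28, as write_28bytes ...
   write_0bytes do above.  */
static void
write_tail (unsigned char *end, uint32_t pattern, unsigned int n)
{
  switch (n)
    {
    case 28: memcpy (end - 28, &pattern, 4);  /* fall through */
    case 24: memcpy (end - 24, &pattern, 4);  /* fall through */
    case 20: memcpy (end - 20, &pattern, 4);  /* fall through */
    case 16: memcpy (end - 16, &pattern, 4);  /* fall through */
    case 12: memcpy (end - 12, &pattern, 4);  /* fall through */
    case  8: memcpy (end - 8,  &pattern, 4);  /* fall through */
    case  4: memcpy (end - 4,  &pattern, 4);  /* fall through */
    case  0: break;
    }
}

For cache-sized fills, memset-sse2-rep leaves the ladders entirely: the 128bytesormore_endof_L1 path runs rep stosl over the dwords and finishes the one-to-three byte remainder by hand.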
+L(aligned_16_89bytes): + movdqa %xmm0, -89(%edx) +L(aligned_16_73bytes): + movdqa %xmm0, -73(%edx) +L(aligned_16_57bytes): + movdqa %xmm0, -57(%edx) +L(aligned_16_41bytes): + movdqa %xmm0, -41(%edx) +L(aligned_16_25bytes): + movdqa %xmm0, -25(%edx) +L(aligned_16_9bytes): + movq %xmm0, -9(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_122bytes): + movdqa %xmm0, -122(%edx) +L(aligned_16_106bytes): + movdqa %xmm0, -106(%edx) +L(aligned_16_90bytes): + movdqa %xmm0, -90(%edx) +L(aligned_16_74bytes): + movdqa %xmm0, -74(%edx) +L(aligned_16_58bytes): + movdqa %xmm0, -58(%edx) +L(aligned_16_42bytes): + movdqa %xmm0, -42(%edx) +L(aligned_16_26bytes): + movdqa %xmm0, -26(%edx) +L(aligned_16_10bytes): + movq %xmm0, -10(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_123bytes): + movdqa %xmm0, -123(%edx) +L(aligned_16_107bytes): + movdqa %xmm0, -107(%edx) +L(aligned_16_91bytes): + movdqa %xmm0, -91(%edx) +L(aligned_16_75bytes): + movdqa %xmm0, -75(%edx) +L(aligned_16_59bytes): + movdqa %xmm0, -59(%edx) +L(aligned_16_43bytes): + movdqa %xmm0, -43(%edx) +L(aligned_16_27bytes): + movdqa %xmm0, -27(%edx) +L(aligned_16_11bytes): + movq %xmm0, -11(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_124bytes): + movdqa %xmm0, -124(%edx) +L(aligned_16_108bytes): + movdqa %xmm0, -108(%edx) +L(aligned_16_92bytes): + movdqa %xmm0, -92(%edx) +L(aligned_16_76bytes): + movdqa %xmm0, -76(%edx) +L(aligned_16_60bytes): + movdqa %xmm0, -60(%edx) +L(aligned_16_44bytes): + movdqa %xmm0, -44(%edx) +L(aligned_16_28bytes): + movdqa %xmm0, -28(%edx) +L(aligned_16_12bytes): + movq %xmm0, -12(%edx) + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_125bytes): + movdqa %xmm0, -125(%edx) +L(aligned_16_109bytes): + movdqa %xmm0, -109(%edx) +L(aligned_16_93bytes): + movdqa %xmm0, -93(%edx) +L(aligned_16_77bytes): + movdqa %xmm0, -77(%edx) +L(aligned_16_61bytes): + movdqa %xmm0, -61(%edx) +L(aligned_16_45bytes): + movdqa %xmm0, -45(%edx) +L(aligned_16_29bytes): + movdqa %xmm0, -29(%edx) +L(aligned_16_13bytes): + movq %xmm0, -13(%edx) + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_126bytes): + movdqa %xmm0, -126(%edx) +L(aligned_16_110bytes): + movdqa %xmm0, -110(%edx) +L(aligned_16_94bytes): + movdqa %xmm0, -94(%edx) +L(aligned_16_78bytes): + movdqa %xmm0, -78(%edx) +L(aligned_16_62bytes): + movdqa %xmm0, -62(%edx) +L(aligned_16_46bytes): + movdqa %xmm0, -46(%edx) +L(aligned_16_30bytes): + movdqa %xmm0, -30(%edx) +L(aligned_16_14bytes): + movq %xmm0, -14(%edx) + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_127bytes): + movdqa %xmm0, -127(%edx) +L(aligned_16_111bytes): + movdqa %xmm0, -111(%edx) +L(aligned_16_95bytes): + movdqa %xmm0, -95(%edx) +L(aligned_16_79bytes): + movdqa %xmm0, -79(%edx) +L(aligned_16_63bytes): + movdqa %xmm0, -63(%edx) +L(aligned_16_47bytes): + movdqa %xmm0, -47(%edx) +L(aligned_16_31bytes): + movdqa %xmm0, -31(%edx) +L(aligned_16_15bytes): + movq %xmm0, -15(%edx) + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN_END + +END (__memset_sse2_rep) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S new file mode 100644 index 0000000000..d7b8be9114 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset-sse2.S @@ -0,0 +1,860 @@ +/* memset with SSE2 + Copyright (C) 2010-2017 Free 
Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_BZERO +# define DEST PARMS +# define LEN DEST+4 +# define SETRTNVAL +#else +# define DEST PARMS +# define CHR DEST+4 +# define LEN CHR+4 +# define SETRTNVAL movl DEST(%esp), %eax +#endif + +#ifdef SHARED +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define PARMS 8 /* Preserve EBX. */ +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into EBX and branch to it. TABLE is a + jump table with relative offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + add $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + add (%ebx,%ecx,4), %ebx; \ + add %ecx, %edx; \ + /* We loaded the jump table and adjusted EDX. Go. */ \ + jmp *%ebx +#else +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define PARMS 4 +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. */ +# define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ + add %ecx, %edx; \ + jmp *TABLE(,%ecx,4) +#endif + + .section .text.sse2,"ax",@progbits +#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO +ENTRY (__memset_chk_sse2) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk_sse2) +#endif +ENTRY (__memset_sse2) + ENTRANCE + + movl LEN(%esp), %ecx +#ifdef USE_AS_BZERO + xor %eax, %eax +#else + movzbl CHR(%esp), %eax + movb %al, %ah + /* Fill the whole EAX with pattern. 
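The instructions around this comment replicate the fill byte across all four bytes of EAX: movb %al, %ah duplicates it into bits 8-15, then the movl/shl/or sequence copies the low word into the high word. In C:

#include <stdint.h>

/* Byte-to-dword replication, equivalent to multiplying the byte by
   0x01010101.  */
static inline uint32_t
replicate_byte (unsigned char c)
{
  uint32_t x = c;
  x |= x << 8;                   /* movb %al, %ah     */
  x |= x << 16;                  /* movl; shl $16; or */
  return x;
}

movd/pshufd later broadcast this dword into XMM0, and once a fill exceeds __x86_shared_cache_size this variant switches to movntdq non-temporal stores finished with sfence, so very large memsets do not flush the caches.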
*/ + movl %eax, %edx + shl $16, %eax + or %edx, %eax +#endif + movl DEST(%esp), %edx + cmp $32, %ecx + jae L(32bytesormore) + +L(write_less32bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_less_32bytes)) + + + .pushsection .rodata.sse2,"a",@progbits + ALIGN (2) +L(table_less_32bytes): + .int JMPTBL (L(write_0bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_1bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_2bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_3bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_4bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_5bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_6bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_7bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_8bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_9bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_10bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_11bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_12bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_13bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_14bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_15bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_16bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_17bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_18bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_19bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_20bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_21bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_22bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_23bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_24bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_25bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_26bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_27bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_28bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_29bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_30bytes), L(table_less_32bytes)) + .int JMPTBL (L(write_31bytes), L(table_less_32bytes)) + .popsection + + ALIGN (4) +L(write_28bytes): + movl %eax, -28(%edx) +L(write_24bytes): + movl %eax, -24(%edx) +L(write_20bytes): + movl %eax, -20(%edx) +L(write_16bytes): + movl %eax, -16(%edx) +L(write_12bytes): + movl %eax, -12(%edx) +L(write_8bytes): + movl %eax, -8(%edx) +L(write_4bytes): + movl %eax, -4(%edx) +L(write_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(write_29bytes): + movl %eax, -29(%edx) +L(write_25bytes): + movl %eax, -25(%edx) +L(write_21bytes): + movl %eax, -21(%edx) +L(write_17bytes): + movl %eax, -17(%edx) +L(write_13bytes): + movl %eax, -13(%edx) +L(write_9bytes): + movl %eax, -9(%edx) +L(write_5bytes): + movl %eax, -5(%edx) +L(write_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_30bytes): + movl %eax, -30(%edx) +L(write_26bytes): + movl %eax, -26(%edx) +L(write_22bytes): + movl %eax, -22(%edx) +L(write_18bytes): + movl %eax, -18(%edx) +L(write_14bytes): + movl %eax, -14(%edx) +L(write_10bytes): + movl %eax, -10(%edx) +L(write_6bytes): + movl %eax, -6(%edx) +L(write_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(write_31bytes): + movl %eax, -31(%edx) +L(write_27bytes): + movl %eax, -27(%edx) +L(write_23bytes): + movl %eax, -23(%edx) +L(write_19bytes): + movl %eax, -19(%edx) +L(write_15bytes): + movl %eax, -15(%edx) +L(write_11bytes): + movl %eax, -11(%edx) +L(write_7bytes): + movl %eax, -7(%edx) +L(write_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +/* 
ECX > 32 and EDX is 4 byte aligned. */ +L(32bytesormore): + /* Fill xmm0 with the pattern. */ +#ifdef USE_AS_BZERO + pxor %xmm0, %xmm0 +#else + movd %eax, %xmm0 + pshufd $0, %xmm0, %xmm0 +#endif + testl $0xf, %edx + jz L(aligned_16) +/* ECX > 32 and EDX is not 16 byte aligned. */ +L(not_aligned_16): + movdqu %xmm0, (%edx) + movl %edx, %eax + and $-16, %edx + add $16, %edx + sub %edx, %eax + add %eax, %ecx + movd %xmm0, %eax + + ALIGN (4) +L(aligned_16): + cmp $128, %ecx + jae L(128bytesormore) + +L(aligned_16_less128bytes): + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytesormore): +#ifdef SHARED_CACHE_SIZE + PUSH (%ebx) + mov $SHARED_CACHE_SIZE, %ebx +#else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx +# else + PUSH (%ebx) + mov __x86_shared_cache_size, %ebx +# endif +#endif + cmp %ebx, %ecx + jae L(128bytesormore_nt_start) + + +#ifdef DATA_CACHE_SIZE + POP (%ebx) +# define RESTORE_EBX_STATE CFI_PUSH (%ebx) + cmp $DATA_CACHE_SIZE, %ecx +#else +# ifdef SHARED +# define RESTORE_EBX_STATE + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx +# else + POP (%ebx) +# define RESTORE_EBX_STATE CFI_PUSH (%ebx) + cmp __x86_data_cache_size, %ecx +# endif +#endif + + jae L(128bytes_L2_normal) + subl $128, %ecx +L(128bytesormore_normal): + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jb L(128bytesless_normal) + + + sub $128, %ecx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + lea 128(%edx), %edx + jae L(128bytesormore_normal) + +L(128bytesless_normal): + add $128, %ecx + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + ALIGN (4) +L(128bytes_L2_normal): + prefetcht0 0x380(%edx) + prefetcht0 0x3c0(%edx) + sub $128, %ecx + movdqa %xmm0, (%edx) + movaps %xmm0, 0x10(%edx) + movaps %xmm0, 0x20(%edx) + movaps %xmm0, 0x30(%edx) + movaps %xmm0, 0x40(%edx) + movaps %xmm0, 0x50(%edx) + movaps %xmm0, 0x60(%edx) + movaps %xmm0, 0x70(%edx) + add $128, %edx + cmp $128, %ecx + jae L(128bytes_L2_normal) + +L(128bytesless_L2_normal): + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + RESTORE_EBX_STATE +L(128bytesormore_nt_start): + sub %ebx, %ecx + ALIGN (4) +L(128bytesormore_shared_cache_loop): + prefetcht0 0x3c0(%edx) + prefetcht0 0x380(%edx) + sub $0x80, %ebx + movdqa %xmm0, (%edx) + movdqa %xmm0, 0x10(%edx) + movdqa %xmm0, 0x20(%edx) + movdqa %xmm0, 0x30(%edx) + movdqa %xmm0, 0x40(%edx) + movdqa %xmm0, 0x50(%edx) + movdqa %xmm0, 0x60(%edx) + movdqa %xmm0, 0x70(%edx) + add $0x80, %edx + cmp $0x80, %ebx + jae L(128bytesormore_shared_cache_loop) + cmp $0x80, %ecx + jb L(shared_cache_loop_end) + ALIGN (4) +L(128bytesormore_nt): + sub $0x80, %ecx + movntdq %xmm0, (%edx) + movntdq %xmm0, 0x10(%edx) + movntdq %xmm0, 0x20(%edx) + movntdq %xmm0, 0x30(%edx) + movntdq %xmm0, 0x40(%edx) + movntdq %xmm0, 0x50(%edx) + movntdq %xmm0, 0x60(%edx) + movntdq %xmm0, 0x70(%edx) + add $0x80, %edx + cmp $0x80, %ecx + jae L(128bytesormore_nt) + sfence +L(shared_cache_loop_end): +#if defined DATA_CACHE_SIZE || !defined SHARED + POP (%ebx) +#endif + BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + + + .pushsection 
.rodata.sse2,"a",@progbits + ALIGN (2) +L(table_16_128bytes): + .int JMPTBL (L(aligned_16_0bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_1bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_2bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_3bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_4bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_5bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_6bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_7bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_8bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_9bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_10bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_11bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_12bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_13bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_14bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_15bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_16bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_17bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_18bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_19bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_20bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_21bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_22bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_23bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_24bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_25bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_26bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_27bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_28bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_29bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_30bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_31bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_32bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_33bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_34bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_35bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_36bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_37bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_38bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_39bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_40bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_41bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_42bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_43bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_44bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_45bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_46bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_47bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_48bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_49bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_50bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_51bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_52bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_53bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_54bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_55bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_56bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_57bytes), L(table_16_128bytes)) + .int JMPTBL 
(L(aligned_16_58bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_59bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_60bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_61bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_62bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_63bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_64bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_65bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_66bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_67bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_68bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_69bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_70bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_71bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_72bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_73bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_74bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_75bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_76bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_77bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_78bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_79bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_80bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_81bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_82bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_83bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_84bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_85bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_86bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_87bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_88bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_89bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_90bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_91bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_92bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_93bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_94bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_95bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_96bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_97bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_98bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_99bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_100bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_101bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_102bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_103bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_104bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_105bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_106bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_107bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_108bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_109bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_110bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_111bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_112bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_113bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_114bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_115bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_116bytes), L(table_16_128bytes)) + .int 
JMPTBL (L(aligned_16_117bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_118bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_119bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_120bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_121bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_122bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_123bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_124bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_125bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_126bytes), L(table_16_128bytes)) + .int JMPTBL (L(aligned_16_127bytes), L(table_16_128bytes)) + .popsection + + ALIGN (4) +L(aligned_16_112bytes): + movdqa %xmm0, -112(%edx) +L(aligned_16_96bytes): + movdqa %xmm0, -96(%edx) +L(aligned_16_80bytes): + movdqa %xmm0, -80(%edx) +L(aligned_16_64bytes): + movdqa %xmm0, -64(%edx) +L(aligned_16_48bytes): + movdqa %xmm0, -48(%edx) +L(aligned_16_32bytes): + movdqa %xmm0, -32(%edx) +L(aligned_16_16bytes): + movdqa %xmm0, -16(%edx) +L(aligned_16_0bytes): + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_113bytes): + movdqa %xmm0, -113(%edx) +L(aligned_16_97bytes): + movdqa %xmm0, -97(%edx) +L(aligned_16_81bytes): + movdqa %xmm0, -81(%edx) +L(aligned_16_65bytes): + movdqa %xmm0, -65(%edx) +L(aligned_16_49bytes): + movdqa %xmm0, -49(%edx) +L(aligned_16_33bytes): + movdqa %xmm0, -33(%edx) +L(aligned_16_17bytes): + movdqa %xmm0, -17(%edx) +L(aligned_16_1bytes): + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_114bytes): + movdqa %xmm0, -114(%edx) +L(aligned_16_98bytes): + movdqa %xmm0, -98(%edx) +L(aligned_16_82bytes): + movdqa %xmm0, -82(%edx) +L(aligned_16_66bytes): + movdqa %xmm0, -66(%edx) +L(aligned_16_50bytes): + movdqa %xmm0, -50(%edx) +L(aligned_16_34bytes): + movdqa %xmm0, -34(%edx) +L(aligned_16_18bytes): + movdqa %xmm0, -18(%edx) +L(aligned_16_2bytes): + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_115bytes): + movdqa %xmm0, -115(%edx) +L(aligned_16_99bytes): + movdqa %xmm0, -99(%edx) +L(aligned_16_83bytes): + movdqa %xmm0, -83(%edx) +L(aligned_16_67bytes): + movdqa %xmm0, -67(%edx) +L(aligned_16_51bytes): + movdqa %xmm0, -51(%edx) +L(aligned_16_35bytes): + movdqa %xmm0, -35(%edx) +L(aligned_16_19bytes): + movdqa %xmm0, -19(%edx) +L(aligned_16_3bytes): + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_116bytes): + movdqa %xmm0, -116(%edx) +L(aligned_16_100bytes): + movdqa %xmm0, -100(%edx) +L(aligned_16_84bytes): + movdqa %xmm0, -84(%edx) +L(aligned_16_68bytes): + movdqa %xmm0, -68(%edx) +L(aligned_16_52bytes): + movdqa %xmm0, -52(%edx) +L(aligned_16_36bytes): + movdqa %xmm0, -36(%edx) +L(aligned_16_20bytes): + movdqa %xmm0, -20(%edx) +L(aligned_16_4bytes): + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_117bytes): + movdqa %xmm0, -117(%edx) +L(aligned_16_101bytes): + movdqa %xmm0, -101(%edx) +L(aligned_16_85bytes): + movdqa %xmm0, -85(%edx) +L(aligned_16_69bytes): + movdqa %xmm0, -69(%edx) +L(aligned_16_53bytes): + movdqa %xmm0, -53(%edx) +L(aligned_16_37bytes): + movdqa %xmm0, -37(%edx) +L(aligned_16_21bytes): + movdqa %xmm0, -21(%edx) +L(aligned_16_5bytes): + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_118bytes): + movdqa %xmm0, -118(%edx) +L(aligned_16_102bytes): + movdqa %xmm0, -102(%edx) +L(aligned_16_86bytes): + movdqa %xmm0, -86(%edx) +L(aligned_16_70bytes): + movdqa %xmm0, -70(%edx) +L(aligned_16_54bytes): + movdqa %xmm0, 
-54(%edx) +L(aligned_16_38bytes): + movdqa %xmm0, -38(%edx) +L(aligned_16_22bytes): + movdqa %xmm0, -22(%edx) +L(aligned_16_6bytes): + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_119bytes): + movdqa %xmm0, -119(%edx) +L(aligned_16_103bytes): + movdqa %xmm0, -103(%edx) +L(aligned_16_87bytes): + movdqa %xmm0, -87(%edx) +L(aligned_16_71bytes): + movdqa %xmm0, -71(%edx) +L(aligned_16_55bytes): + movdqa %xmm0, -55(%edx) +L(aligned_16_39bytes): + movdqa %xmm0, -39(%edx) +L(aligned_16_23bytes): + movdqa %xmm0, -23(%edx) +L(aligned_16_7bytes): + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_120bytes): + movdqa %xmm0, -120(%edx) +L(aligned_16_104bytes): + movdqa %xmm0, -104(%edx) +L(aligned_16_88bytes): + movdqa %xmm0, -88(%edx) +L(aligned_16_72bytes): + movdqa %xmm0, -72(%edx) +L(aligned_16_56bytes): + movdqa %xmm0, -56(%edx) +L(aligned_16_40bytes): + movdqa %xmm0, -40(%edx) +L(aligned_16_24bytes): + movdqa %xmm0, -24(%edx) +L(aligned_16_8bytes): + movq %xmm0, -8(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_121bytes): + movdqa %xmm0, -121(%edx) +L(aligned_16_105bytes): + movdqa %xmm0, -105(%edx) +L(aligned_16_89bytes): + movdqa %xmm0, -89(%edx) +L(aligned_16_73bytes): + movdqa %xmm0, -73(%edx) +L(aligned_16_57bytes): + movdqa %xmm0, -57(%edx) +L(aligned_16_41bytes): + movdqa %xmm0, -41(%edx) +L(aligned_16_25bytes): + movdqa %xmm0, -25(%edx) +L(aligned_16_9bytes): + movq %xmm0, -9(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_122bytes): + movdqa %xmm0, -122(%edx) +L(aligned_16_106bytes): + movdqa %xmm0, -106(%edx) +L(aligned_16_90bytes): + movdqa %xmm0, -90(%edx) +L(aligned_16_74bytes): + movdqa %xmm0, -74(%edx) +L(aligned_16_58bytes): + movdqa %xmm0, -58(%edx) +L(aligned_16_42bytes): + movdqa %xmm0, -42(%edx) +L(aligned_16_26bytes): + movdqa %xmm0, -26(%edx) +L(aligned_16_10bytes): + movq %xmm0, -10(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_123bytes): + movdqa %xmm0, -123(%edx) +L(aligned_16_107bytes): + movdqa %xmm0, -107(%edx) +L(aligned_16_91bytes): + movdqa %xmm0, -91(%edx) +L(aligned_16_75bytes): + movdqa %xmm0, -75(%edx) +L(aligned_16_59bytes): + movdqa %xmm0, -59(%edx) +L(aligned_16_43bytes): + movdqa %xmm0, -43(%edx) +L(aligned_16_27bytes): + movdqa %xmm0, -27(%edx) +L(aligned_16_11bytes): + movq %xmm0, -11(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_124bytes): + movdqa %xmm0, -124(%edx) +L(aligned_16_108bytes): + movdqa %xmm0, -108(%edx) +L(aligned_16_92bytes): + movdqa %xmm0, -92(%edx) +L(aligned_16_76bytes): + movdqa %xmm0, -76(%edx) +L(aligned_16_60bytes): + movdqa %xmm0, -60(%edx) +L(aligned_16_44bytes): + movdqa %xmm0, -44(%edx) +L(aligned_16_28bytes): + movdqa %xmm0, -28(%edx) +L(aligned_16_12bytes): + movq %xmm0, -12(%edx) + movl %eax, -4(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_125bytes): + movdqa %xmm0, -125(%edx) +L(aligned_16_109bytes): + movdqa %xmm0, -109(%edx) +L(aligned_16_93bytes): + movdqa %xmm0, -93(%edx) +L(aligned_16_77bytes): + movdqa %xmm0, -77(%edx) +L(aligned_16_61bytes): + movdqa %xmm0, -61(%edx) +L(aligned_16_45bytes): + movdqa %xmm0, -45(%edx) +L(aligned_16_29bytes): + movdqa %xmm0, -29(%edx) +L(aligned_16_13bytes): + movq %xmm0, -13(%edx) + movl %eax, -5(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_126bytes): + movdqa %xmm0, -126(%edx) +L(aligned_16_110bytes): + movdqa %xmm0, 
-110(%edx) +L(aligned_16_94bytes): + movdqa %xmm0, -94(%edx) +L(aligned_16_78bytes): + movdqa %xmm0, -78(%edx) +L(aligned_16_62bytes): + movdqa %xmm0, -62(%edx) +L(aligned_16_46bytes): + movdqa %xmm0, -46(%edx) +L(aligned_16_30bytes): + movdqa %xmm0, -30(%edx) +L(aligned_16_14bytes): + movq %xmm0, -14(%edx) + movl %eax, -6(%edx) + movw %ax, -2(%edx) + SETRTNVAL + RETURN + + ALIGN (4) +L(aligned_16_127bytes): + movdqa %xmm0, -127(%edx) +L(aligned_16_111bytes): + movdqa %xmm0, -111(%edx) +L(aligned_16_95bytes): + movdqa %xmm0, -95(%edx) +L(aligned_16_79bytes): + movdqa %xmm0, -79(%edx) +L(aligned_16_63bytes): + movdqa %xmm0, -63(%edx) +L(aligned_16_47bytes): + movdqa %xmm0, -47(%edx) +L(aligned_16_31bytes): + movdqa %xmm0, -31(%edx) +L(aligned_16_15bytes): + movq %xmm0, -15(%edx) + movl %eax, -7(%edx) + movw %ax, -3(%edx) + movb %al, -1(%edx) + SETRTNVAL + RETURN_END + +END (__memset_sse2) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S new file mode 100644 index 0000000000..f601663a9f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset.S @@ -0,0 +1,75 @@ +/* Multiple versions of memset + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) + .text +ENTRY(memset) + .type memset, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memset_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memset_sse2) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memset_sse2_rep) +2: ret +END(memset) + +# undef ENTRY +# define ENTRY(name) \ + .type __memset_ia32, @function; \ + .globl __memset_ia32; \ + .p2align 4; \ + __memset_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memset_ia32, .-__memset_ia32 + +# undef ENTRY_CHK +# define ENTRY_CHK(name) \ + .type __memset_chk_ia32, @function; \ + .globl __memset_chk_ia32; \ + .p2align 4; \ + __memset_chk_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END_CHK +# define END_CHK(name) \ + cfi_endproc; .size __memset_chk_ia32, .-__memset_chk_ia32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. 
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_memset; __GI_memset = __memset_ia32 +# endif + +# undef strong_alias +# define strong_alias(original, alias) +#endif + +#include "../memset.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S new file mode 100644 index 0000000000..573cf4208a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/memset_chk.S @@ -0,0 +1,82 @@ +/* Multiple versions of __memset_chk + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in lib. */ +#if IS_IN (libc) + .text +ENTRY(__memset_chk) + .type __memset_chk, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__memset_chk_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__memset_chk_sse2) + HAS_ARCH_FEATURE (Fast_Rep_String) + jz 2f + LOAD_FUNC_GOT_EAX (__memset_chk_sse2_rep) +2: ret +END(__memset_chk) + +# ifdef SHARED +strong_alias (__memset_chk, __memset_zero_constant_len_parameter) + .section .gnu.warning.__memset_zero_constant_len_parameter + .string "memset used with constant zero length parameter; this could be due to transposed parameters" +# else + .text + .type __memset_chk_sse2, @function + .p2align 4; +__memset_chk_sse2: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memset_sse2 + cfi_endproc + .size __memset_chk_sse2, .-__memset_chk_sse2 + + .type __memset_chk_sse2_rep, @function + .p2align 4; +__memset_chk_sse2_rep: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memset_sse2_rep + cfi_endproc + .size __memset_chk_sse2_rep, .-__memset_chk_sse2_rep + + .type __memset_chk_ia32, @function + .p2align 4; +__memset_chk_ia32: + cfi_startproc + CALL_MCOUNT + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp __memset_ia32 + cfi_endproc + .size __memset_chk_ia32, .-__memset_chk_ia32 +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S new file mode 100644 index 0000000000..88c0e5776c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2-bsf.S @@ -0,0 +1,3 @@ +#define USE_AS_RAWMEMCHR +#define MEMCHR __rawmemchr_sse2_bsf +#include "memchr-sse2-bsf.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S new file mode 100644 index 0000000000..038c74896b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_RAWMEMCHR +#define MEMCHR 
__rawmemchr_sse2 +#include "memchr-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S new file mode 100644 index 0000000000..0a41d63ee8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rawmemchr.S @@ -0,0 +1,65 @@ +/* Multiple versions of rawmemchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>.  */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__rawmemchr) + .type __rawmemchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + HAS_CPU_FEATURE (SSE2) + jz 2f + HAS_ARCH_FEATURE (Slow_BSF) + jz 3f + + LOAD_FUNC_GOT_EAX (__rawmemchr_sse2) + ret + +2: LOAD_FUNC_GOT_EAX (__rawmemchr_ia32) + ret + +3: LOAD_FUNC_GOT_EAX (__rawmemchr_sse2_bsf) + ret +END(__rawmemchr) + +weak_alias(__rawmemchr, rawmemchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __rawmemchr_ia32, @function; \ + .globl __rawmemchr_ia32; \ + .p2align 4; \ + __rawmemchr_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __rawmemchr_ia32, .-__rawmemchr_ia32 + +# undef libc_hidden_def +/* IFUNC doesn't work with hidden functions in a shared library, since + they would be called without setting up EBX, which is needed for the + PLT used by IFUNC. */ +# define libc_hidden_def(name) \ + .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_ia32 + +#endif +#include "../../rawmemchr.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c new file mode 100644 index 0000000000..1aa5440644 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/rtld-strnlen.c @@ -0,0 +1 @@ +#include <string/strnlen.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c new file mode 100644 index 0000000000..2e9619f97c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma-fma.c @@ -0,0 +1,27 @@ +/* FMA version of fma. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU + Lesser General Public License for more details. 
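The __rawmemchr entry above is the dispatch pattern this whole multiarch directory repeats: a @gnu_indirect_function resolver runs once, at relocation time, tests the CPU-feature bits, and leaves the chosen implementation's address in EAX for the dynamic linker to patch into the GOT. A minimal C-level sketch of the same mechanism, assuming GCC's ifunc attribute; the my_memchr_* names are hypothetical stand-ins, and the real resolvers are hand-written assembly precisely because of the EBX/PLT constraint noted in the comment above:

```c
#include <stddef.h>

extern void *my_memchr_ia32 (const void *s, int c, size_t n);
extern void *my_memchr_sse2 (const void *s, int c, size_t n);

typedef void *(*memchr_fn) (const void *, int, size_t);

/* Called once by the dynamic linker; the returned address is written
   into the GOT, so every later call is a plain indirect call with no
   per-call feature test.  */
static memchr_fn
my_memchr_resolver (void)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("sse2") ? my_memchr_sse2 : my_memchr_ia32;
}

void *my_memchr (const void *s, int c, size_t n)
     __attribute__ ((ifunc ("my_memchr_resolver")));
```

As the LOAD_GOT_AND_RTLD_GLOBAL_RO and HAS_CPU_FEATURE macros suggest, the assembly resolvers consult feature bits that ld.so computed once at startup rather than issuing CPUID on every resolution.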
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +double +__fma_fma (double x, double y, double z) +{ + asm ("vfmadd213sd %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c new file mode 100644 index 0000000000..411ebb2ba9 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fma.c @@ -0,0 +1,34 @@ +/* Multiple versions of fma. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include <math.h> +#include <init-arch.h> + +extern double __fma_ia32 (double x, double y, double z) attribute_hidden; +extern double __fma_fma (double x, double y, double z) attribute_hidden; + +libm_ifunc (__fma, + HAS_ARCH_FEATURE (FMA_Usable) ? __fma_fma : __fma_ia32); +weak_alias (__fma, fma) + +#define __fma __fma_ia32 + +#include <sysdeps/ieee754/ldbl-96/s_fma.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c new file mode 100644 index 0000000000..ee57abfda2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf-fma.c @@ -0,0 +1,27 @@ +/* FMA version of fmaf. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +float +__fmaf_fma (float x, float y, float z) +{ + asm ("vfmadd213ss %3, %2, %0" : "=x" (x) : "0" (x), "x" (y), "xm" (z)); + return x; +} diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c new file mode 100644 index 0000000000..00b0fbcfc5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/s_fmaf.c @@ -0,0 +1,34 @@ +/* Multiple versions of fmaf. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
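The point of the one-instruction s_fma-fma.c above is rounding, not just speed: vfmadd213sd computes x*y+z with a single rounding, which is exactly the fma() contract and why it can stand in for the generic ldbl-96 fallback whenever FMA_Usable is set. A hedged demonstration via the GCC builtin (with -mfma this compiles to the same instruction, otherwise it calls libm; build with -ffp-contract=off so the unfused line is not silently contracted):

```c
#include <stdio.h>

static double fused   (double x, double y, double z) { return __builtin_fma (x, y, z); }
static double unfused (double x, double y, double z) { return x * y + z; }

int main (void)
{
  double x = 1.0 + 0x1p-52, y = 1.0 - 0x1p-52, z = -1.0;
  /* Exactly, x*y = 1 - 2^-104.  The fused form keeps those low bits
     (result -0x1p-104); the unfused form rounds x*y to 1.0 first and
     returns +0.  */
  printf ("fused:   %a\n", fused (x, y, z));
  printf ("unfused: %a\n", unfused (x, y, z));
  return 0;
}
```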
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> + +#include <math.h> +#include <init-arch.h> + +extern float __fmaf_ia32 (float x, float y, float z) attribute_hidden; +extern float __fmaf_fma (float x, float y, float z) attribute_hidden; + +libm_ifunc (__fmaf, + HAS_ARCH_FEATURE (FMA_Usable) ? __fmaf_fma : __fmaf_ia32); +weak_alias (__fmaf, fmaf) + +#define __fmaf __fmaf_ia32 + +#include <sysdeps/ieee754/dbl-64/s_fmaf.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c new file mode 100644 index 0000000000..7db31b02f8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/sched_cpucount.c @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/sched_cpucount.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S new file mode 100644 index 0000000000..46ca1b3074 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_sse2 +#include "strcpy-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S new file mode 100644 index 0000000000..d971c2da38 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S new file mode 100644 index 0000000000..ee81ab6ae3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpcpy.S @@ -0,0 +1,9 @@ +/* Multiple versions of stpcpy + All versions must be listed in ifunc-impl-list.c. 
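The stpcpy-*.S stubs above show the template trick used throughout this directory: one assembly body (strcpy-sse2.S or strcpy-ssse3.S) is assembled several times, with USE_AS_STPCPY and USE_AS_STRNCPY changing only the return value and the length handling. What stpcpy buys the caller is clearest in a reference C version (a sketch of the semantics only, not of the SIMD code):

```c
#include <string.h>

/* Like strcpy, but return the address of the copied NUL, so repeated
   appends need no rescan of the growing string.  */
static char *
stpcpy_ref (char *dst, const char *src)
{
  size_t len = strlen (src);
  memcpy (dst, src, len + 1);   /* include the terminating NUL */
  return dst + len;             /* points at that NUL in dst */
}
```

Chaining p = stpcpy_ref (p, part) concatenates in linear time, where a loop of strcat calls would rescan the prefix each round and go quadratic.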
*/ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S new file mode 100644 index 0000000000..37a703cb76 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-sse2.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_sse2 +#include "strcpy-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S new file mode 100644 index 0000000000..14ed16f6b5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S new file mode 100644 index 0000000000..2698ca6a8c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/stpncpy.S @@ -0,0 +1,8 @@ +/* Multiple versions of stpncpy + All versions must be listed in ifunc-impl-list.c. */ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c new file mode 100644 index 0000000000..753c6ec84a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp-c.c @@ -0,0 +1,12 @@ +#include <string.h> + +extern __typeof (strcasecmp) __strcasecmp_nonascii; + +#define __strcasecmp __strcasecmp_nonascii +#include <string/strcasecmp.c> + +strong_alias (__strcasecmp_nonascii, __strcasecmp_ia32) + +/* The needs of strcasecmp in libc are minimal, no need to go through + the IFUNC. */ +strong_alias (__strcasecmp_nonascii, __GI___strcasecmp) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S new file mode 100644 index 0000000000..ec59276408 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp.S @@ -0,0 +1,39 @@ +/* Entry point for multi-version x86 strcasecmp. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
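strcasecmp-c.c above also shows the other recurring idiom: strong_alias points the internal __GI___strcasecmp symbol straight at the C implementation, so calls from inside libc bypass the IFUNC machinery entirely. A hedged sketch of the underlying mechanism using GCC's alias attribute, with hypothetical names:

```c
/* One definition, two link-time names: callers of casecmp_internal
   bind directly to casecmp_nonascii, with no dispatch in between.  */
static int
casecmp_nonascii (const char *a, const char *b)
{
  /* Placeholder body; the real file pulls in string/strcasecmp.c.  */
  return (unsigned char) *a - (unsigned char) *b;
}

int casecmp_internal (const char *, const char *)
    __attribute__ ((alias ("casecmp_nonascii")));
```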
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY(__strcasecmp) + .type __strcasecmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strcasecmp_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__strcasecmp_ssse3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + LOAD_FUNC_GOT_EAX (__strcasecmp_sse4_2) +2: ret +END(__strcasecmp) + +weak_alias (__strcasecmp, strcasecmp) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c new file mode 100644 index 0000000000..d4fcd2b4a1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-c.c @@ -0,0 +1,13 @@ +#include <string.h> + +extern __typeof (strcasecmp_l) __strcasecmp_l_nonascii; + +#define __strcasecmp_l __strcasecmp_l_nonascii +#define USE_IN_EXTENDED_LOCALE_MODEL 1 +#include <string/strcasecmp.c> + +strong_alias (__strcasecmp_l_nonascii, __strcasecmp_l_ia32) + +/* The needs of strcasecmp in libc are minimal, no need to go through + the IFUNC. */ +strong_alias (__strcasecmp_l_nonascii, __GI___strcasecmp_l) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S new file mode 100644 index 0000000000..411d4153f2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-sse4.S @@ -0,0 +1,2 @@ +#define USE_AS_STRCASECMP_L 1 +#include "strcmp-sse4.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S new file mode 100644 index 0000000000..a22b93c518 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l-ssse3.S @@ -0,0 +1,2 @@ +#define USE_AS_STRCASECMP_L 1 +#include "strcmp-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S new file mode 100644 index 0000000000..711c09b0dc --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcasecmp_l.S @@ -0,0 +1,7 @@ +/* Multiple versions of strcasecmp_l + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP __strcasecmp_l +#define USE_AS_STRCASECMP_L +#include "strcmp.S" + +weak_alias (__strcasecmp_l, strcasecmp_l) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S new file mode 100644 index 0000000000..6359c7330c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-sse2.S @@ -0,0 +1,1245 @@ +/* strcat with SSE2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
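The __strcasecmp resolver above encodes its policy in three conditional jumps, which is easy to misread: SSSE3 gates everything, and SSE4.2 is chosen only when the Slow_SSE4_2 workaround bit is clear. Restated in C (a sketch; the cmp_* names stand in for __strcasecmp_{ia32,ssse3,sse4_2}):

```c
typedef int (*cmp_fn) (const char *, const char *);

extern int cmp_ia32   (const char *, const char *);
extern int cmp_ssse3  (const char *, const char *);
extern int cmp_sse4_2 (const char *, const char *);

static cmp_fn
select_strcasecmp (int has_ssse3, int has_sse4_2, int slow_sse4_2)
{
  if (!has_ssse3)
    return cmp_ia32;              /* the first jz 2f taken */
  if (has_sse4_2 && !slow_sse4_2)
    return cmp_sse4_2;            /* fell through both later tests */
  return cmp_ssse3;               /* the value loaded into EAX in between */
}
```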
*/ + + +#if IS_IN (libc) + +# include <sysdep.h> + + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifdef SHARED +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into ECX and branch to it.  TABLE is a + jump table with relative offsets.  INDEX is a register that contains the + index into the jump table.  SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into ECX. */ \ + SETUP_PIC_REG(cx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ecx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ecx,INDEX,SCALE), %ecx; \ + /* We loaded the jump table and adjusted ECX. Go. */ \ + jmp *%ecx +# else +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table.  TABLE is a jump table with + absolute offsets.  INDEX is a register that contains the index into the + jump table.  SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + +# ifndef STRCAT +# define STRCAT __strcat_sse2 +# endif + +# define PARMS 4 +# define STR1 PARMS+4 +# define STR2 STR1+4 + +# ifdef USE_AS_STRNCAT +# define LEN STR2+8 +# define STR3 STR1+4 +# else +# define STR3 STR1 +# endif + +# define USE_AS_STRCAT +# ifdef USE_AS_STRNCAT +# define RETURN POP(%ebx); POP(%esi); ret; CFI_PUSH(%ebx); CFI_PUSH(%esi); +# else +# define RETURN POP(%esi); ret; CFI_PUSH(%esi); +# endif + +.text +ENTRY (STRCAT) + PUSH (%esi) + mov STR1(%esp), %eax + mov STR2(%esp), %esi +# ifdef USE_AS_STRNCAT + PUSH (%ebx) + movl LEN(%esp), %ebx + test %ebx, %ebx + jz L(ExitZero) +# endif + cmpb $0, (%esi) + mov %esi, %ecx + mov %eax, %edx + jz L(ExitZero) + + and $63, %ecx + and $63, %edx + cmp $32, %ecx + ja L(StrlenCore7_1) + cmp $48, %edx + ja L(alignment_prolog) + + pxor %xmm0, %xmm0 + pxor %xmm4, %xmm4 + pxor %xmm7, %xmm7 + movdqu (%eax), %xmm1 + movdqu (%esi), %xmm5 + pcmpeqb %xmm1, %xmm0 + movdqu 16(%esi), %xmm6 + pmovmskb %xmm0, %ecx + pcmpeqb %xmm5, %xmm4 + pcmpeqb %xmm6, %xmm7 + test %ecx, %ecx + jnz L(exit_less16_) + mov %eax, %ecx + and $-16, %eax + jmp L(loop_prolog) + +L(alignment_prolog): + pxor %xmm0, %xmm0 + pxor %xmm4, %xmm4 + mov %edx, %ecx + pxor %xmm7, %xmm7 + and $15, %ecx + and $-16, %eax + pcmpeqb (%eax), %xmm0 + movdqu (%esi), %xmm5 + movdqu 16(%esi), %xmm6 + pmovmskb %xmm0, %edx + pcmpeqb %xmm5, %xmm4 + shr %cl, %edx + pcmpeqb %xmm6, %xmm7 + test %edx, %edx + jnz L(exit_less16) + add %eax, %ecx + + pxor %xmm0, %xmm0 +L(loop_prolog): + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + .p2align 4 +L(align16_loop): + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%eax), %eax + test %edx, %edx + jz L(align16_loop) + bsf %edx, %edx + add %edx, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit16): + bsf %edx, %edx + lea 16(%eax, %edx), %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit32): + bsf %edx, %edx + lea 32(%eax, %edx), %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit48): + bsf %edx, %edx + lea 48(%eax, %edx), %eax + jmp L(StartStrcpyPart) + + 
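Note how JMPTBL and BRANCH_TO_JMPTBL_ENTRY differ between the SHARED and static builds: in PIC code each table entry is stored as label minus table base, so the table holds no absolute addresses (and therefore needs no load-time relocations), and the table's runtime address is added back just before the indirect jump. GCC's computed-goto extension allows the same construction in C; a sketch, GCC-specific by design:

```c
#include <stdio.h>

static const char *
tail_copy (int n)               /* n = remaining byte count, 0..2 here */
{
  /* Entries are label differences: link-time constants, independent of
     the load address, the C analogue of "JMPTBL(I, B) I - B".  */
  static const int table[] = {
    &&tail0 - &&tail0,
    &&tail1 - &&tail0,
    &&tail2 - &&tail0,
  };
  /* Base address plus relative entry, then jump: the same arithmetic
     BRANCH_TO_JMPTBL_ENTRY performs in %ecx.  */
  goto *(&&tail0 + table[n]);
 tail0: return "handle 0 trailing bytes";
 tail1: return "handle 1 trailing byte";
 tail2: return "handle 2 trailing bytes";
}

int main (void)
{
  puts (tail_copy (2));
  return 0;
}
```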
.p2align 4 +L(exit_less16): + bsf %edx, %edx + add %ecx, %eax + add %edx, %eax + jmp L(StartStrcpyPart) + + .p2align 4 +L(exit_less16_): + bsf %ecx, %ecx + add %ecx, %eax + + .p2align 4 +L(StartStrcpyPart): + pmovmskb %xmm4, %edx +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1) + + movdqu %xmm5, (%eax) + pmovmskb %xmm7, %edx +# ifdef USE_AS_STRNCAT + cmp $32, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes1) + + mov %esi, %ecx + and $-16, %esi + and $15, %ecx + pxor %xmm0, %xmm0 +# ifdef USE_AS_STRNCAT + add %ecx, %ebx + sbb %edx, %edx + or %edx, %ebx +# endif + sub %ecx, %eax + jmp L(Unalign16Both) + +L(StrlenCore7_1): + mov %eax, %ecx + pxor %xmm0, %xmm0 + and $15, %ecx + and $-16, %eax + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + shr %cl, %edx + test %edx, %edx + jnz L(exit_less16_1) + add %eax, %ecx + + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + + .p2align 4 +L(align16_loop_1): + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16_1) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32_1) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48_1) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%eax), %eax + test %edx, %edx + jz L(align16_loop_1) + bsf %edx, %edx + add %edx, %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit16_1): + bsf %edx, %edx + lea 16(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit32_1): + bsf %edx, %edx + lea 32(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit48_1): + bsf %edx, %edx + lea 48(%eax, %edx), %eax + jmp L(StartStrcpyPart_1) + + .p2align 4 +L(exit_less16_1): + bsf %edx, %edx + add %ecx, %eax + add %edx, %eax + + .p2align 4 +L(StartStrcpyPart_1): + mov %esi, %ecx + and $15, %ecx + and $-16, %esi + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + +# ifdef USE_AS_STRNCAT + cmp $48, %ebx + ja L(BigN) +# endif + pcmpeqb (%esi), %xmm1 +# ifdef USE_AS_STRNCAT + add %ecx, %ebx +# endif + pmovmskb %xmm1, %edx + shr %cl, %edx +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STRNCAT + cmp $32, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%eax) + sub %ecx, %eax + + .p2align 4 +L(Unalign16Both): + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $48, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) +L(Unalign16BothBigN): + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%eax, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm4 + movdqu %xmm3, (%eax, %ecx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm1 + movdqu %xmm4, 
(%eax, %ecx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%eax, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx +# ifdef USE_AS_STRNCAT + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + movdqu %xmm3, (%eax, %ecx) + mov %esi, %edx + lea 16(%esi, %ecx), %esi + and $-0x40, %esi + sub %esi, %edx + sub %edx, %eax +# ifdef USE_AS_STRNCAT + lea 128(%ebx, %edx), %ebx +# endif + movaps (%esi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%esi), %xmm5 + movaps 32(%esi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%esi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx +# ifdef USE_AS_STRNCAT + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jnz L(Unaligned64Leave) + + .p2align 4 +L(Unaligned64Loop_start): + add $64, %eax + add $64, %esi + movdqu %xmm4, -64(%eax) + movaps (%esi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%eax) + movaps 16(%esi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%esi), %xmm3 + movdqu %xmm6, -32(%eax) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%eax) + movaps 48(%esi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx +# ifdef USE_AS_STRNCAT + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) +# endif + test %edx, %edx + jz L(Unaligned64Loop_start) + +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %ecx, %ecx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %ecx, %edx + movdqu %xmm4, (%eax) + movdqu %xmm5, 16(%eax) + movdqu %xmm6, 32(%eax) + add $48, %esi + add $48, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +# ifdef USE_AS_STRNCAT + .p2align 4 +L(BigN): + pcmpeqb (%esi), %xmm1 + pmovmskb %xmm1, %edx + shr %cl, %edx + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%eax) + sub %ecx, %eax + sub $48, %ebx + add %ecx, %ebx + + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%eax, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + jmp L(Unalign16BothBigN) +# endif + +/*------------end of main part-------------------------------*/ + +/* Case1 */ + .p2align 4 +L(CopyFrom1To16Bytes): + add %ecx, %eax + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTail): + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %esi + add $16, %eax +L(CopyFrom1To16BytesTail1): + bsf %edx, %edx + 
BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + bsf %edx, %edx + add %ecx, %esi + add $16, %edx + sub %ecx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %ecx, %edx + movdqu %xmm4, (%eax) + add $16, %esi + add $16, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %edx, %edx + movdqu %xmm4, (%eax) + movdqu %xmm5, 16(%eax) + add $32, %esi + add $32, %eax + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +# ifdef USE_AS_STRNCAT + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %ecx, %eax + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + add $16, %edx + sub %ecx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +L(CopyFrom1To16BytesTailCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %ecx, %eax + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To32BytesCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTailCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %eax + add $16, %esi + sub $16, %ebx +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) + +# endif + +# ifdef USE_AS_STRNCAT + .p2align 4 +L(StrncatExit0): + movb %bh, (%eax) + mov STR3(%esp), %eax + RETURN +# endif + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit1): + movb %bh, 1(%eax) +# endif +L(Exit1): +# ifdef USE_AS_STRNCAT + movb (%esi), %dh +# endif + movb %dh, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit2): + movb %bh, 2(%eax) +# endif +L(Exit2): + movw (%esi), %dx + movw %dx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit3): + movb %bh, 3(%eax) +# endif +L(Exit3): + movw (%esi), %cx + movw %cx, (%eax) +# ifdef USE_AS_STRNCAT + movb 2(%esi), %dh +# endif + movb %dh, 2(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit4): + movb %bh, 4(%eax) +# endif +L(Exit4): + movl (%esi), %edx + movl %edx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit5): + movb %bh, 5(%eax) +# endif +L(Exit5): + movl (%esi), %ecx +# 
ifdef USE_AS_STRNCAT + movb 4(%esi), %dh +# endif + movb %dh, 4(%eax) + movl %ecx, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit6): + movb %bh, 6(%eax) +# endif +L(Exit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%eax) + movw %dx, 4(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit7): + movb %bh, 7(%eax) +# endif +L(Exit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%eax) + movl %edx, 3(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit8): + movb %bh, 8(%eax) +# endif +L(Exit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit9): + movb %bh, 9(%eax) +# endif +L(Exit9): + movlpd (%esi), %xmm0 +# ifdef USE_AS_STRNCAT + movb 8(%esi), %dh +# endif + movb %dh, 8(%eax) + movlpd %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit10): + movb %bh, 10(%eax) +# endif +L(Exit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%eax) + movw %dx, 8(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit11): + movb %bh, 11(%eax) +# endif +L(Exit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%eax) + movl %edx, 7(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit12): + movb %bh, 12(%eax) +# endif +L(Exit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%eax) + movl %edx, 8(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit13): + movb %bh, 13(%eax) +# endif +L(Exit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 5(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit14): + movb %bh, 14(%eax) +# endif +L(Exit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 6(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit15): + movb %bh, 15(%eax) +# endif +L(Exit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%eax) + movlpd %xmm1, 7(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit16): + movb %bh, 16(%eax) +# endif +L(Exit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit17): + movb %bh, 17(%eax) +# endif +L(Exit17): + movdqu (%esi), %xmm0 +# ifdef USE_AS_STRNCAT + movb 16(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movb %dh, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit18): + movb %bh, 18(%eax) +# endif +L(Exit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%eax) + movw %cx, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit19): + movb %bh, 19(%eax) +# endif +L(Exit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%eax) + movl %ecx, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit20): + movb %bh, 20(%eax) +# endif +L(Exit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%eax) + movl %ecx, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit21): + movb %bh, 21(%eax) +# endif +L(Exit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx +# ifdef 
USE_AS_STRNCAT + movb 20(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movl %ecx, 16(%eax) + movb %dh, 20(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit22): + movb %bh, 22(%eax) +# endif +L(Exit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%eax) + movlpd %xmm3, 14(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit23): + movb %bh, 23(%eax) +# endif +L(Exit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%eax) + movlpd %xmm3, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit24): + movb %bh, 24(%eax) +# endif +L(Exit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit25): + movb %bh, 25(%eax) +# endif +L(Exit25): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 +# ifdef USE_AS_STRNCAT + movb 24(%esi), %dh +# endif + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movb %dh, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit26): + movb %bh, 26(%eax) +# endif +L(Exit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movw %cx, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit27): + movb %bh, 27(%eax) +# endif +L(Exit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movl %ecx, 23(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit28): + movb %bh, 28(%eax) +# endif +L(Exit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%eax) + movlpd %xmm2, 16(%eax) + movl %ecx, 24(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit29): + movb %bh, 29(%eax) +# endif +L(Exit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 13(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit30): + movb %bh, 30(%eax) +# endif +L(Exit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 14(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit31): + movb %bh, 31(%eax) +# endif +L(Exit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 15(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +# ifdef USE_AS_STRNCAT +L(StrncatExit32): + movb %bh, 32(%eax) +# endif +L(Exit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%eax) + movdqu %xmm2, 16(%eax) + mov STR3(%esp), %eax + RETURN + +# ifdef USE_AS_STRNCAT + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %edx, %edx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%ebx), %ecx + and $-16, %ecx + add $48, %ebx + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%eax) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%eax) + xor %bh, %bh + movb %bh, 64(%eax) + mov STR3(%esp), %eax + RETURN + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %ecx, %ecx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $48, %ebx + jle 
L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm4, (%eax) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm5, 16(%eax) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm6, 32(%eax) + lea 16(%eax, %ecx), %eax + lea 16(%esi, %ecx), %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncatTable), %ebx, 4) +# endif + .p2align 4 +L(ExitZero): + RETURN + +END (STRCAT) + + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) +# ifdef USE_AS_STRNCAT +L(ExitStrncatTable): + .int JMPTBL(L(StrncatExit0), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit1), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit2), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit3), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit4), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit5), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit6), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit7), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit8), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit9), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit10), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit11), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit12), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit13), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit14), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit15), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit16), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit17), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit18), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit19), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit20), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit21), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit22), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit23), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit24), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit25), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit26), L(ExitStrncatTable)) + .int 
JMPTBL(L(StrncatExit27), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit28), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit29), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit30), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit31), L(ExitStrncatTable)) + .int JMPTBL(L(StrncatExit32), L(ExitStrncatTable)) +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S new file mode 100644 index 0000000000..59ffbc60a5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat-ssse3.S @@ -0,0 +1,572 @@ +/* strcat with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef STRCAT +# define STRCAT __strcat_ssse3 +# endif + +# define PARMS 4 +# define STR1 PARMS+4 +# define STR2 STR1+4 + +# ifdef USE_AS_STRNCAT +# define LEN STR2+8 +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) + PUSH (%edi) + mov STR1(%esp), %edi + mov %edi, %edx + +# define RETURN jmp L(StartStrcpyPart) +# include "strlen-sse2.S" + +L(StartStrcpyPart): + mov STR2(%esp), %ecx + lea (%edi, %eax), %edx +# ifdef USE_AS_STRNCAT + PUSH (%ebx) + mov LEN(%esp), %ebx + test %ebx, %ebx + jz L(StrncatExit0) + cmp $8, %ebx + jbe L(StrncatExit8Bytes) +# endif + cmpb $0, (%ecx) + jz L(Exit1) + cmpb $0, 1(%ecx) + jz L(Exit2) + cmpb $0, 2(%ecx) + jz L(Exit3) + cmpb $0, 3(%ecx) + jz L(Exit4) + cmpb $0, 4(%ecx) + jz L(Exit5) + cmpb $0, 5(%ecx) + jz L(Exit6) + cmpb $0, 6(%ecx) + jz L(Exit7) + cmpb $0, 7(%ecx) + jz L(Exit8) + cmpb $0, 8(%ecx) + jz L(Exit9) +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + jb L(StrncatExit15Bytes) +# endif + cmpb $0, 9(%ecx) + jz L(Exit10) + cmpb $0, 10(%ecx) + jz L(Exit11) + cmpb $0, 11(%ecx) + jz L(Exit12) + cmpb $0, 12(%ecx) + jz L(Exit13) + cmpb $0, 13(%ecx) + jz L(Exit14) + cmpb $0, 14(%ecx) + jz L(Exit15) + cmpb $0, 15(%ecx) + jz L(Exit16) +# ifdef USE_AS_STRNCAT + cmp $16, %ebx + je L(StrncatExit16) + +# define RETURN1 \ + POP (%ebx); \ + POP (%edi); \ + ret; \ + CFI_PUSH (%ebx); \ + CFI_PUSH (%edi) +# define USE_AS_STRNCPY +# else +# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi) +# endif +# include "strcpy-ssse3.S" + .p2align 4 +L(CopyFrom1To16Bytes): + add %esi, %edx + add %esi, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) 
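+ /* Descriptive note (added): bits 0-6 of the NUL mask in %al are
+    clear at this point, so the terminator is the eighth byte; the
+    fall-through copies all eight bytes, terminator included, with
+    a single 8-byte move. */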
+ movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + movlpd (%ecx), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit1): + movb %bh, 1(%edx) +L(Exit1): + movb (%ecx), %al + movb %al, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit2): + movb %bh, 2(%edx) +L(Exit2): + movw (%ecx), %ax + movw %ax, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit3): + movb %bh, 3(%edx) +L(Exit3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit4): + movb %bh, 4(%edx) +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit5): + movb %bh, 5(%edx) +L(Exit5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit6): + movb %bh, 6(%edx) +L(Exit6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit7): + movb %bh, 7(%edx) +L(Exit7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit8): + movb %bh, 8(%edx) +L(Exit8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit9): + movb %bh, 9(%edx) +L(Exit9): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit10): + movb %bh, 10(%edx) +L(Exit10): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit11): + movb %bh, 11(%edx) +L(Exit11): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit12): + movb %bh, 12(%edx) +L(Exit12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit13): + movb %bh, 13(%edx) +L(Exit13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit14): + movb %bh, 14(%edx) +L(Exit14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit15): + movb %bh, 15(%edx) +L(Exit15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit16): + movb %bh, 16(%edx) +L(Exit16): + movlpd (%ecx), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) + movl %edi, %eax + RETURN1 + +# ifdef USE_AS_STRNCPY + + CFI_PUSH(%esi) + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %esi, %ecx + lea (%esi, %edx), %esi + lea -9(%ebx), %edx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%esi), %edx + POP (%esi) + jz L(ExitHighCase2) + + test $0x01, %al + jnz L(Exit1) + cmp $1, %ebx + je L(StrncatExit1) + test $0x02, %al + jnz L(Exit2) + cmp $2, %ebx + je 
L(StrncatExit2) + test $0x04, %al + jnz L(Exit3) + cmp $3, %ebx + je L(StrncatExit3) + test $0x08, %al + jnz L(Exit4) + cmp $4, %ebx + je L(StrncatExit4) + test $0x10, %al + jnz L(Exit5) + cmp $5, %ebx + je L(StrncatExit5) + test $0x20, %al + jnz L(Exit6) + cmp $6, %ebx + je L(StrncatExit6) + test $0x40, %al + jnz L(Exit7) + cmp $7, %ebx + je L(StrncatExit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + lea 7(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax + xor %cl, %cl + movb %cl, (%eax) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(ExitHighCase2): + test $0x01, %ah + jnz L(Exit9) + cmp $9, %ebx + je L(StrncatExit9) + test $0x02, %ah + jnz L(Exit10) + cmp $10, %ebx + je L(StrncatExit10) + test $0x04, %ah + jnz L(Exit11) + cmp $11, %ebx + je L(StrncatExit11) + test $0x8, %ah + jnz L(Exit12) + cmp $12, %ebx + je L(StrncatExit12) + test $0x10, %ah + jnz L(Exit13) + cmp $13, %ebx + je L(StrncatExit13) + test $0x20, %ah + jnz L(Exit14) + cmp $14, %ebx + je L(StrncatExit14) + test $0x40, %ah + jnz L(Exit15) + cmp $15, %ebx + je L(StrncatExit15) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm1 + movlpd %xmm1, 8(%edx) + movl %edi, %eax + RETURN1 + + CFI_PUSH(%esi) + +L(CopyFrom1To16BytesCase2OrCase3): + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %esi, %edx + add %esi, %ecx + + POP (%esi) + + cmp $8, %ebx + ja L(ExitHighCase3) + cmp $1, %ebx + je L(StrncatExit1) + cmp $2, %ebx + je L(StrncatExit2) + cmp $3, %ebx + je L(StrncatExit3) + cmp $4, %ebx + je L(StrncatExit4) + cmp $5, %ebx + je L(StrncatExit5) + cmp $6, %ebx + je L(StrncatExit6) + cmp $7, %ebx + je L(StrncatExit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movb %bh, 8(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(ExitHighCase3): + cmp $9, %ebx + je L(StrncatExit9) + cmp $10, %ebx + je L(StrncatExit10) + cmp $11, %ebx + je L(StrncatExit11) + cmp $12, %ebx + je L(StrncatExit12) + cmp $13, %ebx + je L(StrncatExit13) + cmp $14, %ebx + je L(StrncatExit14) + cmp $15, %ebx + je L(StrncatExit15) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm1 + movlpd %xmm1, 8(%edx) + movb %bh, 16(%edx) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit0): + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit15Bytes): + cmp $9, %ebx + je L(StrncatExit9) + cmpb $0, 9(%ecx) + jz L(Exit10) + cmp $10, %ebx + je L(StrncatExit10) + cmpb $0, 10(%ecx) + jz L(Exit11) + cmp $11, %ebx + je L(StrncatExit11) + cmpb $0, 11(%ecx) + jz L(Exit12) + cmp $12, %ebx + je L(StrncatExit12) + cmpb $0, 12(%ecx) + jz L(Exit13) + cmp $13, %ebx + je L(StrncatExit13) + cmpb $0, 13(%ecx) + jz L(Exit14) + cmp $14, %ebx + je L(StrncatExit14) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) + lea 14(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax + movb %bh, (%eax) + movl %edi, %eax + RETURN1 + + .p2align 4 +L(StrncatExit8Bytes): + cmpb $0, (%ecx) + jz L(Exit1) + cmp $1, %ebx + je L(StrncatExit1) + cmpb $0, 1(%ecx) + jz L(Exit2) + cmp $2, %ebx + je L(StrncatExit2) + cmpb $0, 2(%ecx) + jz L(Exit3) + cmp $3, %ebx + je L(StrncatExit3) + cmpb $0, 3(%ecx) + jz L(Exit4) + cmp $4, %ebx + je L(StrncatExit4) + cmpb $0, 4(%ecx) + jz L(Exit5) + cmp $5, %ebx + je L(StrncatExit5) + cmpb $0, 5(%ecx) + jz L(Exit6) + cmp $6, %ebx + je L(StrncatExit6) + cmpb $0, 6(%ecx) + jz L(Exit7) + cmp $7, %ebx + je L(StrncatExit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + lea 7(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax + movb %bh, 
(%eax) + movl %edi, %eax + RETURN1 + +# endif +END (STRCAT) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S new file mode 100644 index 0000000000..8412cb6f23 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcat.S @@ -0,0 +1,92 @@ +/* Multiple versions of strcat + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifndef USE_AS_STRNCAT +# ifndef STRCAT +# define STRCAT strcat +# endif +#endif + +#ifdef USE_AS_STRNCAT +# define STRCAT_SSSE3 __strncat_ssse3 +# define STRCAT_SSE2 __strncat_sse2 +# define STRCAT_IA32 __strncat_ia32 +# define __GI_STRCAT __GI_strncat +#else +# define STRCAT_SSSE3 __strcat_ssse3 +# define STRCAT_SSE2 __strcat_sse2 +# define STRCAT_IA32 __strcat_ia32 +# define __GI_STRCAT __GI_strcat +#endif + + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncat in static library since we + need strncat before the initialization happened. */ +#if IS_IN (libc) + + .text +ENTRY(STRCAT) + .type STRCAT, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (STRCAT_IA32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (STRCAT_SSE2) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (STRCAT_SSSE3) +2: ret +END(STRCAT) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCAT_IA32, @function; \ + .align 16; \ + .globl STRCAT_IA32; \ + .hidden STRCAT_IA32; \ + STRCAT_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCAT_IA32, .-STRCAT_IA32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcat calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCAT; __GI_STRCAT = STRCAT_IA32 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCAT; __GI___STRCAT = STRCAT_IA32 + +# endif +#endif + +#ifndef USE_AS_STRNCAT +# include "../../strcat.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S new file mode 100644 index 0000000000..95fd7c084e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S @@ -0,0 +1,158 @@ +/* strchr with SSE2 with bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 +# define ENTRANCE PUSH(%edi) +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); + +# define STR1 PARMS +# define STR2 STR1+4 + + .text +ENTRY (__strchr_sse2_bsf) + + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + /* ECX has OFFSET. */ + and $15, %ecx + pshufd $0, %xmm1, %xmm1 + je L(loop) + +/* Handle unaligned string. */ + and $-16, %edi + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm2, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + sarl %cl, %edx + sarl %cl, %eax + test %eax, %eax + je L(unaligned_no_match) + /* Check which byte is a match. */ + bsf %eax, %eax + /* Is there a NULL? */ + test %edx, %edx + je L(unaligned_match) + bsf %edx, %edx + cmpl %edx, %eax + /* Return NULL if NULL comes first. */ + ja L(return_null) +L(unaligned_match): + add %edi, %eax + add %ecx, %eax + RETURN + + .p2align 4 +L(unaligned_no_match): + test %edx, %edx + jne L(return_null) + pxor %xmm2, %xmm2 + + add $16, %edi + + .p2align 4 +/* Loop start on aligned string. */ +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + jmp L(loop) + +L(matches): + pmovmskb %xmm2, %edx + test %eax, %eax + jz L(return_null) + bsf %eax, %eax + /* There is a match. First find where NULL is. */ + test %edx, %edx + je L(match) + bsf %edx, %ecx + /* Check if NULL comes first. */ + cmpl %ecx, %eax + ja L(return_null) +L(match): + sub $16, %edi + add %edi, %eax + RETURN + +/* Return NULL. 
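(Added note: reached when the string ends before the target byte
   appears, or when the only candidate matches lie beyond the
   terminating NUL.)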
*/ + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + +END (__strchr_sse2_bsf) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S new file mode 100644 index 0000000000..1f9e875b04 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr-sse2.S @@ -0,0 +1,348 @@ +/* strchr SSE2 without bsf + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 +# define ENTRANCE PUSH(%edi) +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); + +# define STR1 PARMS +# define STR2 STR1+4 + + atom_text_section +ENTRY (__strchr_sse2) + + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + /* ECX has OFFSET. */ + and $15, %ecx + pshufd $0, %xmm1, %xmm1 + je L(loop) + +/* Handle unaligned string. */ + and $-16, %edi + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm2, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + sarl %cl, %edx + sarl %cl, %eax + test %eax, %eax + jz L(unaligned_no_match) + /* Check which byte is a match. */ + /* Is there a NULL? */ + add %ecx, %edi + test %edx, %edx + jz L(match_case1) + jmp L(match_case2) + + .p2align 4 +L(unaligned_no_match): + test %edx, %edx + jne L(return_null) + + pxor %xmm2, %xmm2 + add $16, %edi + + .p2align 4 +/* Loop start on aligned string. */ +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches) + test %edx, %edx + jnz L(return_null) + add $16, %edi + jmp L(loop) + +L(matches): + /* There is a match. First find where NULL is. 
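(Added note: a match only counts if it occurs before the terminator,
   so when the block also contains a NUL -- case 2 below -- the match
   and NUL masks are checked bit by bit from the low end, and a NUL
   seen first sends control to return_null.)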
*/ + test %edx, %edx + jz L(match_case1) + + .p2align 4 +L(match_case2): + test %al, %al + jz L(match_higth_case2) + + mov %al, %cl + and $15, %cl + jnz L(match_case2_4) + + mov %dl, %ch + and $15, %ch + jnz L(return_null) + + test $0x10, %al + jnz L(Exit5) + test $0x10, %dl + jnz L(return_null) + test $0x20, %al + jnz L(Exit6) + test $0x20, %dl + jnz L(return_null) + test $0x40, %al + jnz L(Exit7) + test $0x40, %dl + jnz L(return_null) + lea 7(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_4): + test $0x01, %al + jnz L(Exit1) + test $0x01, %dl + jnz L(return_null) + test $0x02, %al + jnz L(Exit2) + test $0x02, %dl + jnz L(return_null) + test $0x04, %al + jnz L(Exit3) + test $0x04, %dl + jnz L(return_null) + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(match_higth_case2): + test %dl, %dl + jnz L(return_null) + + mov %ah, %cl + and $15, %cl + jnz L(match_case2_12) + + mov %dh, %ch + and $15, %ch + jnz L(return_null) + + test $0x10, %ah + jnz L(Exit13) + test $0x10, %dh + jnz L(return_null) + test $0x20, %ah + jnz L(Exit14) + test $0x20, %dh + jnz L(return_null) + test $0x40, %ah + jnz L(Exit15) + test $0x40, %dh + jnz L(return_null) + lea 15(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_12): + test $0x01, %ah + jnz L(Exit9) + test $0x01, %dh + jnz L(return_null) + test $0x02, %ah + jnz L(Exit10) + test $0x02, %dh + jnz L(return_null) + test $0x04, %ah + jnz L(Exit11) + test $0x04, %dh + jnz L(return_null) + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(match_case1): + test %al, %al + jz L(match_higth_case1) + + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + lea 7(%edi), %eax + RETURN + + .p2align 4 +L(match_higth_case1): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + lea 15(%edi), %eax + RETURN + + .p2align 4 +L(Exit1): + lea (%edi), %eax + RETURN + + .p2align 4 +L(Exit2): + lea 1(%edi), %eax + RETURN + + .p2align 4 +L(Exit3): + lea 2(%edi), %eax + RETURN + + .p2align 4 +L(Exit4): + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(Exit5): + lea 4(%edi), %eax + RETURN + + .p2align 4 +L(Exit6): + lea 5(%edi), %eax + RETURN + + .p2align 4 +L(Exit7): + lea 6(%edi), %eax + RETURN + + .p2align 4 +L(Exit9): + lea 8(%edi), %eax + RETURN + + .p2align 4 +L(Exit10): + lea 9(%edi), %eax + RETURN + + .p2align 4 +L(Exit11): + lea 10(%edi), %eax + RETURN + + .p2align 4 +L(Exit12): + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(Exit13): + lea 12(%edi), %eax + RETURN + + .p2align 4 +L(Exit14): + lea 13(%edi), %eax + RETURN + + .p2align 4 +L(Exit15): + lea 14(%edi), %eax + RETURN + +/* Return NULL. */ + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + +END (__strchr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S new file mode 100644 index 0000000000..5b97b1c767 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strchr.S @@ -0,0 +1,57 @@ +/* Multiple versions of strchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(strchr) + .type strchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strchr_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__strchr_sse2_bsf) + HAS_ARCH_FEATURE (Slow_BSF) + jz 2f + LOAD_FUNC_GOT_EAX (__strchr_sse2) +2: ret +END(strchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __strchr_ia32, @function; \ + .globl __strchr_ia32; \ + .p2align 4; \ + __strchr_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strchr_ia32, .-__strchr_ia32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strchr; __GI_strchr = __strchr_ia32 +#endif + +#include "../../i586/strchr.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S new file mode 100644 index 0000000000..cd26058671 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-sse4.S @@ -0,0 +1,804 @@ +/* strcmp with SSE4.2 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_STRNCMP +# ifndef STRCMP +# define STRCMP __strncmp_sse4_2 +# endif +# define STR1 8 +# define STR2 STR1+4 +# define CNT STR2+4 +# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM) +# define REM %ebp +#elif defined USE_AS_STRCASECMP_L +# include "locale-defines.h" +# ifndef STRCMP +# define STRCMP __strcasecmp_l_sse4_2 +# endif +# ifdef PIC +# define STR1 12 +# else +# define STR1 8 +# endif +# define STR2 STR1+4 +# define LOCALE 12 /* Loaded before the adjustment. 
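(Added note: the locale_t argument is read from the stack at function
   entry, before any register pushes, so this offset does not account
   for them.)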
*/ +# ifdef PIC +# define RETURN POP (%edi); POP (%ebx); ret; \ + .p2align 4; CFI_PUSH (%ebx); CFI_PUSH (%edi) +# else +# define RETURN POP (%edi); ret; .p2align 4; CFI_PUSH (%edi) +# endif +# define NONASCII __strcasecmp_nonascii +#elif defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" +# ifndef STRCMP +# define STRCMP __strncasecmp_l_sse4_2 +# endif +# ifdef PIC +# define STR1 16 +# else +# define STR1 12 +# endif +# define STR2 STR1+4 +# define CNT STR2+4 +# define LOCALE 16 /* Loaded before the adjustment. */ +# ifdef PIC +# define RETURN POP (%edi); POP (REM); POP (%ebx); ret; \ + .p2align 4; \ + CFI_PUSH (%ebx); CFI_PUSH (REM); CFI_PUSH (%edi) +# else +# define RETURN POP (%edi); POP (REM); ret; \ + .p2align 4; CFI_PUSH (REM); CFI_PUSH (%edi) +# endif +# define REM %ebp +# define NONASCII __strncasecmp_nonascii +#else +# ifndef STRCMP +# define STRCMP __strcmp_sse4_2 +# endif +# define STR1 4 +# define STR2 STR1+4 +# define RETURN ret; .p2align 4 +#endif + + .section .text.sse4.2,"ax",@progbits + +#ifdef USE_AS_STRCASECMP_L +ENTRY (__strcasecmp_sse4_2) +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) + movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax +# ifdef NO_TLS_DIRECT_SEG_REFS + addl %gs:0, %eax + movl (%eax), %eax +# else + movl %gs:(%eax), %eax +# endif +# else +# ifdef NO_TLS_DIRECT_SEG_REFS + movl %gs:0, %eax + movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax +# else + movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax +# endif +# endif +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) +# ifdef PIC + je L(ascii) + POP (%ebx) + jmp __strcasecmp_nonascii +# else + jne __strcasecmp_nonascii + jmp L(ascii) +# endif +END (__strcasecmp_sse4_2) +#endif + +#ifdef USE_AS_STRNCASECMP_L +ENTRY (__strncasecmp_sse4_2) +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) + movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax +# ifdef NO_TLS_DIRECT_SEG_REFS + addl %gs:0, %eax + movl (%eax), %eax +# else + movl %gs:(%eax), %eax +# endif +# else +# ifdef NO_TLS_DIRECT_SEG_REFS + movl %gs:0, %eax + movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax +# else + movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax +# endif +# endif +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) +# ifdef PIC + je L(ascii) + POP (%ebx) + jmp __strncasecmp_nonascii +# else + jne __strncasecmp_nonascii + jmp L(ascii) +# endif +END (__strncasecmp_sse4_2) +#endif + + ENTRY (STRCMP) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movl LOCALE(%esp), %eax +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) + jne NONASCII + +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) +# endif +L(ascii): + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +.Lbelowupper: + .quad 0x4040404040404040 + .quad 0x4040404040404040 +.Ltopupper: + .quad 0x5b5b5b5b5b5b5b5b + .quad 0x5b5b5b5b5b5b5b5b +.Ltouppermask: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + +# ifdef PIC +# define UCLOW_reg .Lbelowupper@GOTOFF(%ebx) +# define UCHIGH_reg .Ltopupper@GOTOFF(%ebx) +# define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx) +# else +# define UCLOW_reg .Lbelowupper +# define UCHIGH_reg .Ltopupper +# define 
LCQWORD_reg .Ltouppermask +# endif +#endif + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + PUSH (REM) +#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + PUSH (%edi) +#endif + mov STR1(%esp), %edx + mov STR2(%esp), %eax +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + movl CNT(%esp), REM + test REM, REM + je L(eq) +#endif + mov %dx, %cx + and $0xfff, %cx + cmp $0xff0, %cx + ja L(first4bytes) + movdqu (%edx), %xmm2 + mov %eax, %ecx + and $0xfff, %ecx + cmp $0xff0, %ecx + ja L(first4bytes) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define TOLOWER(reg1, reg2) \ + movdqa reg1, %xmm3; \ + movdqa UCHIGH_reg, %xmm4; \ + movdqa reg2, %xmm5; \ + movdqa UCHIGH_reg, %xmm6; \ + pcmpgtb UCLOW_reg, %xmm3; \ + pcmpgtb reg1, %xmm4; \ + pcmpgtb UCLOW_reg, %xmm5; \ + pcmpgtb reg2, %xmm6; \ + pand %xmm4, %xmm3; \ + pand %xmm6, %xmm5; \ + pand LCQWORD_reg, %xmm3; \ + pand LCQWORD_reg, %xmm5; \ + por %xmm3, reg1; \ + por %xmm5, reg2 + + movdqu (%eax), %xmm1 + TOLOWER (%xmm2, %xmm1) + movd %xmm2, %ecx + movd %xmm1, %edi + movdqa %xmm2, %xmm3 + movdqa %xmm1, %xmm4 + cmpl %edi, %ecx +#else +# define TOLOWER(reg1, reg) + + movd %xmm2, %ecx + cmp (%eax), %ecx +#endif + jne L(less4bytes) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + movdqu (%eax), %xmm1 +#endif + pxor %xmm2, %xmm1 + pxor %xmm0, %xmm0 + ptest %xmm1, %xmm0 + jnc L(less16bytes) + pcmpeqb %xmm0, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, REM + jbe L(eq) +#endif + add $16, %edx + add $16, %eax +L(first4bytes): + movzbl (%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl (%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, (%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $1, REM + je L(eq) +#endif + + movzbl 1(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 1(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 1(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $2, REM + je L(eq) +#endif + movzbl 2(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 2(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 2(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, REM + je L(eq) +#endif + movzbl 3(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 3(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), 
%edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 3(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + je L(eq) +#endif + movzbl 4(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 4(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 4(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, REM + je L(eq) +#endif + movzbl 5(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 5(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 5(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, REM + je L(eq) +#endif + movzbl 6(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 6(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 6(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + je L(eq) +#endif + movzbl 7(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 7(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 7(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $8, REM + je L(eq) +#endif + add $8, %eax + add $8, %edx + +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + PUSH (%edi) +#endif + PUSH (%esi) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cfi_remember_state +#endif + mov %edx, %edi + mov %eax, %esi + xorl %eax, %eax +L(check_offset): + movl %edi, %edx + movl %esi, %ecx + andl $0xfff, %edx + andl $0xfff, %ecx + cmpl %edx, %ecx + cmovl %edx, %ecx + lea -0xff0(%ecx), %edx + sub %edx, %edi + sub %edx, %esi + testl %edx, %edx + jg L(crosspage) +L(loop): + movdqu (%esi,%edx), %xmm2 + movdqu (%edi,%edx), %xmm1 + TOLOWER (%xmm2, %xmm1) + pcmpistri $0x1a, %xmm2, %xmm1 + jbe L(end) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, REM + jbe L(more16byteseq) +#endif + + add $16, %edx + jle L(loop) +L(crosspage): + movzbl (%edi,%edx), %eax + movzbl (%esi,%edx), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax + movl 
_nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx +# endif +#endif + subl %ecx, %eax + jne L(ret) + testl %ecx, %ecx + je L(ret) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $1, REM + jbe L(more16byteseq) +#endif + inc %edx + cmp $15, %edx + jle L(crosspage) + add %edx, %edi + add %edx, %esi + jmp L(check_offset) + + .p2align 4 +L(end): + jnc L(ret) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub %ecx, REM + jbe L(more16byteseq) +#endif + lea (%ecx,%edx), %ecx + movzbl (%edi,%ecx), %eax + movzbl (%esi,%ecx), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx +# endif +#endif + subl %ecx, %eax +L(ret): + POP (%esi) + POP (%edi) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + POP (REM) +#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + POP (%ebx) +# endif +#endif + ret + + .p2align 4 +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cfi_restore_state +L(more16byteseq): + POP (%esi) +# ifdef USE_AS_STRNCMP + POP (%edi) +# endif +#endif +L(eq): + xorl %eax, %eax + RETURN + +L(neq): + mov $1, %eax + ja L(neq_bigger) + neg %eax +L(neq_bigger): + RETURN + +L(less16bytes): + add $0xfefefeff, %ecx + jnc L(less4bytes) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movd %xmm3, %edi + xor %edi, %ecx +#else + xor (%edx), %ecx +#endif + or $0xfefefeff, %ecx + add $1, %ecx + jnz L(less4bytes) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + jbe L(eq) +#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + psrldq $4, %xmm3 + psrldq $4, %xmm4 + movd %xmm3, %ecx + movd %xmm4, %edi + cmp %edi, %ecx + mov %ecx, %edi +#else + mov 4(%edx), %ecx + cmp 4(%eax), %ecx +#endif + jne L(more4bytes) + add $0xfefefeff, %ecx + jnc L(more4bytes) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + xor %edi, %ecx +#else + xor 4(%edx), %ecx +#endif + or $0xfefefeff, %ecx + add $1, %ecx + jnz L(more4bytes) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $8, REM + jbe L(eq) +#endif + + add $8, %edx + add $8, %eax +L(less4bytes): + + movzbl (%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl (%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, (%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $1, REM + je L(eq) +#endif + movzbl 1(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 1(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 1(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP 
|| defined USE_AS_STRNCASECMP_L + cmp $2, REM + je L(eq) +#endif + + movzbl 2(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 2(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 2(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, REM + je L(eq) +#endif + movzbl 3(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 3(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 3(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +L(more4bytes): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + je L(eq) +#endif + movzbl 4(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 4(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 4(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, REM + je L(eq) +#endif + movzbl 5(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 5(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 5(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, REM + je L(eq) +#endif + movzbl 6(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 6(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 6(%edx) +#endif + jne L(neq) + cmpl $0, %ecx + je L(eq) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + je L(eq) +#endif + movzbl 7(%eax), %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movzbl 7(%edx), %edi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%edi,4), %edi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%edi,4), %edi +# endif + cmpl %ecx, %edi +#else + cmpb %cl, 7(%edx) +#endif + jne L(neq) + jmp L(eq) + +END (STRCMP) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S new file mode 100644 index 0000000000..b25cc3e068 --- /dev/null +++ 
b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp-ssse3.S @@ -0,0 +1,2810 @@ +/* strcmp with SSSE3 + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +#include <sysdep.h> +#include "asm-syntax.h" + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#ifdef USE_AS_STRNCMP +# ifndef STRCMP +# define STRCMP __strncmp_ssse3 +# endif +# define STR1 8 +# define STR2 STR1+4 +# define CNT STR2+4 +# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM) +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + mov $16, %esi; \ + sub %ecx, %esi; \ + cmp %esi, REM; \ + jbe L(more8byteseq); \ + sub %esi, REM +# define FLAGS %ebx +# define REM %ebp +#elif defined USE_AS_STRCASECMP_L +# include "locale-defines.h" +# ifndef STRCMP +# define STRCMP __strcasecmp_l_ssse3 +# endif +# ifdef PIC +# define STR1 8 +# else +# define STR1 4 +# endif +# define STR2 STR1+4 +# define LOCALE 12 /* Loaded before the adjustment. */ +# ifdef PIC +# define RETURN POP (%ebx); ret; .p2align 4; CFI_PUSH (%ebx) +# else +# define RETURN ret; .p2align 4 +# endif +# define UPDATE_STRNCMP_COUNTER +# define FLAGS (%esp) +# define NONASCII __strcasecmp_nonascii +#elif defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" +# ifndef STRCMP +# define STRCMP __strncasecmp_l_ssse3 +# endif +# ifdef PIC +# define STR1 12 +# else +# define STR1 8 +# endif +# define STR2 STR1+4 +# define CNT STR2+4 +# define LOCALE 16 /* Loaded before the adjustment. 
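(Added note: i.e. the offset is taken relative to %esp as it stands at
   function entry, before the pushes below move it.)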
*/ +# ifdef PIC +# define RETURN POP (REM); POP (%ebx); ret; \ + .p2align 4; CFI_PUSH (%ebx); CFI_PUSH (REM) +# else +# define RETURN POP (REM); ret; .p2align 4; CFI_PUSH (REM) +# endif +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + mov $16, %esi; \ + sub %ecx, %esi; \ + cmp %esi, REM; \ + jbe L(more8byteseq); \ + sub %esi, REM +# define FLAGS (%esp) +# define REM %ebp +# define NONASCII __strncasecmp_nonascii +#else +# ifndef STRCMP +# define STRCMP __strcmp_ssse3 +# endif +# define STR1 4 +# define STR2 STR1+4 +# define RETURN ret; .p2align 4 +# define UPDATE_STRNCMP_COUNTER +# define FLAGS %ebx +#endif + + .section .text.ssse3,"ax",@progbits + +#ifdef USE_AS_STRCASECMP_L +ENTRY (__strcasecmp_ssse3) +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) + movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax +# ifdef NO_TLS_DIRECT_SEG_REFS + addl %gs:0, %eax + movl (%eax), %eax +# else + movl %gs:(%eax), %eax +# endif +# else +# ifdef NO_TLS_DIRECT_SEG_REFS + movl %gs:0, %eax + movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax +# else + movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax +# endif +# endif +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) +# ifdef PIC + je L(ascii) + POP (%ebx) + jmp __strcasecmp_nonascii +# else + jne __strcasecmp_nonascii + jmp L(ascii) +# endif +END (__strcasecmp_ssse3) +#endif + +#ifdef USE_AS_STRNCASECMP_L +ENTRY (__strncasecmp_ssse3) +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) + movl __libc_tsd_LOCALE@GOTNTPOFF(%ebx), %eax +# ifdef NO_TLS_DIRECT_SEG_REFS + addl %gs:0, %eax + movl (%eax), %eax +# else + movl %gs:(%eax), %eax +# endif +# else +# ifdef NO_TLS_DIRECT_SEG_REFS + movl %gs:0, %eax + movl __libc_tsd_LOCALE@NTPOFF(%eax), %eax +# else + movl %gs:__libc_tsd_LOCALE@NTPOFF, %eax +# endif +# endif +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) +# ifdef PIC + je L(ascii) + POP (%ebx) + jmp __strncasecmp_nonascii +# else + jne __strncasecmp_nonascii + jmp L(ascii) +# endif +END (__strncasecmp_ssse3) +#endif + +ENTRY (STRCMP) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movl LOCALE(%esp), %eax +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + movl LOCALE_T___LOCALES+LC_CTYPE*4(%eax), %eax +# else + movl (%eax), %eax +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%eax) + jne NONASCII + +# ifdef PIC + PUSH (%ebx) + LOAD_PIC_REG(bx) +# endif +L(ascii): + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +.Lbelowupper: + .quad 0x4040404040404040 + .quad 0x4040404040404040 +.Ltopupper: + .quad 0x5b5b5b5b5b5b5b5b + .quad 0x5b5b5b5b5b5b5b5b +.Ltouppermask: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + +# ifdef PIC +# define UCLOW_reg .Lbelowupper@GOTOFF(%ebx) +# define UCHIGH_reg .Ltopupper@GOTOFF(%ebx) +# define LCQWORD_reg .Ltouppermask@GOTOFF(%ebx) +# else +# define UCLOW_reg .Lbelowupper +# define UCHIGH_reg .Ltopupper +# define LCQWORD_reg .Ltouppermask +# endif +#endif + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + PUSH (REM) +#endif + + movl STR1(%esp), %edx + movl STR2(%esp), %eax +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + movl CNT(%esp), REM + cmp $16, REM + jb L(less16bytes_sncmp) +#elif !defined USE_AS_STRCASECMP_L + movzbl 
(%eax), %ecx + cmpb %cl, (%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 1(%eax), %ecx + cmpb %cl, 1(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 2(%eax), %ecx + cmpb %cl, 2(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 3(%eax), %ecx + cmpb %cl, 3(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 4(%eax), %ecx + cmpb %cl, 4(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 5(%eax), %ecx + cmpb %cl, 5(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 6(%eax), %ecx + cmpb %cl, 6(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + movzbl 7(%eax), %ecx + cmpb %cl, 7(%edx) + jne L(neq) + cmpl $0, %ecx + je L(eq) + + add $8, %edx + add $8, %eax +#endif + movl %edx, %ecx + and $0xfff, %ecx + cmp $0xff0, %ecx + ja L(crosspage) + mov %eax, %ecx + and $0xfff, %ecx + cmp $0xff0, %ecx + ja L(crosspage) + pxor %xmm0, %xmm0 + movlpd (%eax), %xmm1 + movlpd (%edx), %xmm2 + movhpd 8(%eax), %xmm1 + movhpd 8(%edx), %xmm2 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define TOLOWER(reg1, reg2) \ + movdqa reg1, %xmm5; \ + movdqa reg2, %xmm7; \ + movdqa UCHIGH_reg, %xmm6; \ + pcmpgtb UCLOW_reg, %xmm5; \ + pcmpgtb UCLOW_reg, %xmm7; \ + pcmpgtb reg1, %xmm6; \ + pand %xmm6, %xmm5; \ + movdqa UCHIGH_reg, %xmm6; \ + pcmpgtb reg2, %xmm6; \ + pand %xmm6, %xmm7; \ + pand LCQWORD_reg, %xmm5; \ + por %xmm5, reg1; \ + pand LCQWORD_reg, %xmm7; \ + por %xmm7, reg2 + TOLOWER (%xmm1, %xmm2) +#else +# define TOLOWER(reg1, reg2) +#endif + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %ecx + sub $0xffff, %ecx + jnz L(less16bytes) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(eq) +#endif + add $16, %eax + add $16, %edx + +L(crosspage): + +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + PUSH (FLAGS) +#endif + PUSH (%edi) + PUSH (%esi) +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + pushl $0 + cfi_adjust_cfa_offset (4) +#endif +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cfi_remember_state +#endif + + movl %edx, %edi + movl %eax, %ecx + and $0xf, %ecx + and $0xf, %edi + xor %ecx, %eax + xor %edi, %edx +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + xor FLAGS, FLAGS +#endif + cmp %edi, %ecx + je L(ashr_0) + ja L(bigger) + orl $0x20, FLAGS + xchg %edx, %eax + xchg %ecx, %edi +L(bigger): + lea 15(%edi), %edi + sub %ecx, %edi + cmp $8, %edi + jle L(ashr_less_8) + cmp $14, %edi + je L(ashr_15) + cmp $13, %edi + je L(ashr_14) + cmp $12, %edi + je L(ashr_13) + cmp $11, %edi + je L(ashr_12) + cmp $10, %edi + je L(ashr_11) + cmp $9, %edi + je L(ashr_10) +L(ashr_less_8): + je L(ashr_9) + cmp $7, %edi + je L(ashr_8) + cmp $6, %edi + je L(ashr_7) + cmp $5, %edi + je L(ashr_6) + cmp $4, %edi + je L(ashr_5) + cmp $3, %edi + je L(ashr_4) + cmp $2, %edi + je L(ashr_3) + cmp $1, %edi + je L(ashr_2) + cmp $0, %edi + je L(ashr_1) + +/* + * The following cases will be handled by ashr_0 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +L(ashr_0): + mov $0xffff, %esi + movdqa (%eax), %xmm1 + pxor %xmm0, %xmm0 + pcmpeqb %xmm1, %xmm0 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movdqa (%edx), %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm2, %xmm1 +#else + pcmpeqb (%edx), %xmm1 +#endif + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + mov %ecx, %edi + jne 
L(less32bytes) + UPDATE_STRNCMP_COUNTER + movl $0x10, FLAGS + mov $0x10, %ecx + pxor %xmm0, %xmm0 + .p2align 4 +L(loop_ashr_0): + movdqa (%eax, %ecx), %xmm1 +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + movdqa (%edx, %ecx), %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 +#else + pcmpeqb %xmm1, %xmm0 + pcmpeqb (%edx, %ecx), %xmm1 +#endif + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + jmp L(loop_ashr_0) + +/* + * The following cases will be handled by ashr_1 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +L(ashr_1): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $15, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -15(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $1, FLAGS + lea 1(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_1): + add $16, %edi + jg L(nibble_ashr_1) + +L(gobble_ashr_1): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $1, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_1) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $1, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_1) + + .p2align 4 +L(nibble_ashr_1): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfffe, %esi + jnz L(ashr_1_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $15, REM + jbe L(ashr_1_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_1) + + .p2align 4 +L(ashr_1_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $1, %xmm0 + psrldq $1, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_2 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +L(ashr_2): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -14(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $2, FLAGS + lea 2(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_2): + add $16, %edi + jg L(nibble_ashr_2) + +L(gobble_ashr_2): + movdqa (%eax, %ecx), %xmm1 + movdqa 
(%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $2, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_2) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $2, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_2) + + .p2align 4 +L(nibble_ashr_2): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfffc, %esi + jnz L(ashr_2_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $14, REM + jbe L(ashr_2_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_2) + + .p2align 4 +L(ashr_2_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $2, %xmm0 + psrldq $2, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_3 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +L(ashr_3): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -13(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $3, FLAGS + lea 3(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_3): + add $16, %edi + jg L(nibble_ashr_3) + +L(gobble_ashr_3): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $3, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_3) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $3, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_3) + + .p2align 4 +L(nibble_ashr_3): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfff8, %esi + jnz L(ashr_3_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $13, REM + jbe L(ashr_3_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_3) + + .p2align 4 +L(ashr_3_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $3, %xmm0 + psrldq $3, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_4 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(12~15) n 
-12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +L(ashr_4): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -12(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $4, FLAGS + lea 4(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_4): + add $16, %edi + jg L(nibble_ashr_4) + +L(gobble_ashr_4): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $4, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_4) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $4, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_4) + + .p2align 4 +L(nibble_ashr_4): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfff0, %esi + jnz L(ashr_4_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $12, REM + jbe L(ashr_4_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_4) + + .p2align 4 +L(ashr_4_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_5 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(11~15) n -11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +L(ashr_5): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -11(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $5, FLAGS + lea 5(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_5): + add $16, %edi + jg L(nibble_ashr_5) + +L(gobble_ashr_5): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $5, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_5) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $5, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea 
-16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_5) + + .p2align 4 +L(nibble_ashr_5): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xffe0, %esi + jnz L(ashr_5_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $11, REM + jbe L(ashr_5_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_5) + + .p2align 4 +L(ashr_5_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $5, %xmm0 + psrldq $5, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_6 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(10~15) n -10 5(15 +(n-10) - n) ashr_6 + */ + + .p2align 4 +L(ashr_6): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -10(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $6, FLAGS + lea 6(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_6): + add $16, %edi + jg L(nibble_ashr_6) + +L(gobble_ashr_6): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $6, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_6) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $6, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_6) + + .p2align 4 +L(nibble_ashr_6): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xffc0, %esi + jnz L(ashr_6_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $10, REM + jbe L(ashr_6_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_6) + + .p2align 4 +L(ashr_6_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $6, %xmm0 + psrldq $6, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_7 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n-9) - n) ashr_7 + */ + + .p2align 4 +L(ashr_7): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -9(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $7, FLAGS + lea 8(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_7): + add $16, %edi + jg L(nibble_ashr_7) + +L(gobble_ashr_7): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $7, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb 
%xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_7) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $7, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_7) + + .p2align 4 +L(nibble_ashr_7): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xff80, %esi + jnz L(ashr_7_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $9, REM + jbe L(ashr_7_exittail) +#endif + pxor %xmm0, %xmm0 + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_7) + + .p2align 4 +L(ashr_7_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $7, %xmm0 + psrldq $7, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_8 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n-8) - n) ashr_8 + */ + .p2align 4 +L(ashr_8): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -8(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $8, FLAGS + lea 8(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_8): + add $16, %edi + jg L(nibble_ashr_8) + +L(gobble_ashr_8): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $8, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_8) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $8, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_8) + + .p2align 4 +L(nibble_ashr_8): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xff00, %esi + jnz L(ashr_8_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $8, REM + jbe L(ashr_8_exittail) +#endif + pxor %xmm0, %xmm0 + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_8) + + .p2align 4 +L(ashr_8_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $8, %xmm0 + psrldq $8, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_9 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n-7) - n) ashr_9 + */ + .p2align 4 +L(ashr_9): + mov $0xffff, 
%esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -7(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $9, FLAGS + lea 9(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_9): + add $16, %edi + jg L(nibble_ashr_9) + +L(gobble_ashr_9): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $9, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_9) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $9, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_9) + + .p2align 4 +L(nibble_ashr_9): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfe00, %esi + jnz L(ashr_9_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + jbe L(ashr_9_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_9) + + .p2align 4 +L(ashr_9_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $9, %xmm0 + psrldq $9, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_10 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n-6) - n) ashr_10 + */ + .p2align 4 +L(ashr_10): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -6(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $10, FLAGS + lea 10(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_10): + add $16, %edi + jg L(nibble_ashr_10) + +L(gobble_ashr_10): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $10, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_10) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $10, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, 
%xmm3 + jmp L(loop_ashr_10) + + .p2align 4 +L(nibble_ashr_10): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xfc00, %esi + jnz L(ashr_10_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, REM + jbe L(ashr_10_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_10) + + .p2align 4 +L(ashr_10_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $10, %xmm0 + psrldq $10, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_11 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n-5) - n) ashr_11 + */ + .p2align 4 +L(ashr_11): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -5(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $11, FLAGS + lea 11(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_11): + add $16, %edi + jg L(nibble_ashr_11) + +L(gobble_ashr_11): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $11, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_11) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $11, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_11) + + .p2align 4 +L(nibble_ashr_11): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xf800, %esi + jnz L(ashr_11_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, REM + jbe L(ashr_11_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_11) + + .p2align 4 +L(ashr_11_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $11, %xmm0 + psrldq $11, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_12 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n-4) - n) ashr_12 + */ + .p2align 4 +L(ashr_12): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -4(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $12, FLAGS + lea 12(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_12): + add $16, %edi + jg L(nibble_ashr_12) + +L(gobble_ashr_12): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $12, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, 
%xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_12) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $12, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_12) + + .p2align 4 +L(nibble_ashr_12): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xf000, %esi + jnz L(ashr_12_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + jbe L(ashr_12_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_12) + + .p2align 4 +L(ashr_12_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $12, %xmm0 + psrldq $12, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_13 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n-3) - n) ashr_13 + */ + .p2align 4 +L(ashr_13): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -3(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $13, FLAGS + lea 13(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_13): + add $16, %edi + jg L(nibble_ashr_13) + +L(gobble_ashr_13): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $13, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_13) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $13, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_13) + + .p2align 4 +L(nibble_ashr_13): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xe000, %esi + jnz L(ashr_13_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, REM + jbe L(ashr_13_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_13) + + .p2align 4 +L(ashr_13_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $13, %xmm0 + psrldq $13, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_14 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n-2) - n) ashr_14 + */ + .p2align 4 +L(ashr_14): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa 
(%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -2(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $14, FLAGS + lea 14(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_14): + add $16, %edi + jg L(nibble_ashr_14) + +L(gobble_ashr_14): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $14, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_14) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $14, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_14) + + .p2align 4 +L(nibble_ashr_14): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0xc000, %esi + jnz L(ashr_14_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $2, REM + jbe L(ashr_14_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_14) + + .p2align 4 +L(ashr_14_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $14, %xmm0 + psrldq $14, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_14 + * ecx(offset of esi) eax(offset of edi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n-1) - n) ashr_15 + */ + + .p2align 4 +L(ashr_15): + mov $0xffff, %esi + pxor %xmm0, %xmm0 + movdqa (%edx), %xmm2 + movdqa (%eax), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $1, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %edi + shr %cl, %esi + shr %cl, %edi + sub %edi, %esi + lea -1(%ecx), %edi + jnz L(less32bytes) + + UPDATE_STRNCMP_COUNTER + + movdqa (%edx), %xmm3 + pxor %xmm0, %xmm0 + mov $16, %ecx + orl $15, FLAGS + lea 15(%edx), %edi + and $0xfff, %edi + sub $0x1000, %edi + + .p2align 4 +L(loop_ashr_15): + add $16, %edi + jg L(nibble_ashr_15) + +L(gobble_ashr_15): + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $15, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + + add $16, %edi + jg L(nibble_ashr_15) + + movdqa (%eax, %ecx), %xmm1 + movdqa (%edx, %ecx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $15, %xmm3, %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + sub $0xffff, %esi + jnz L(exit) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $16, REM + lea -16(REM), REM + jbe L(more8byteseq) +#endif + add $16, %ecx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_15) + + .p2align 4 
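+/*
+ * As in the other nibble_ashr_N blocks: %edi carries
+ * ((source address & 0xfff) - 0x1000) and is advanced by 16 on each
+ * gobble iteration, so it turns positive exactly when the next
+ * aligned 16-byte load would touch a new page.  The bytes still
+ * buffered in %xmm3 are scanned for the terminating NUL before that
+ * load happens; only if none is found (and, in the counted variants,
+ * REM is not yet exhausted) is the guard rearmed with -0x1000 and
+ * the loop resumed.
+ */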
+L(nibble_ashr_15): + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %esi + test $0x8000, %esi + jnz L(ashr_15_exittail) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $1, REM + jbe L(ashr_15_exittail) +#endif + pxor %xmm0, %xmm0 + sub $0x1000, %edi + jmp L(gobble_ashr_15) + + .p2align 4 +L(ashr_15_exittail): + movdqa (%eax, %ecx), %xmm1 + psrldq $15, %xmm0 + psrldq $15, %xmm3 + jmp L(aftertail) + + .p2align 4 +L(aftertail): + TOLOWER (%xmm1, %xmm3) + pcmpeqb %xmm3, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %esi + not %esi +L(exit): + mov FLAGS, %edi + and $0x1f, %edi + lea -16(%edi, %ecx), %edi +L(less32bytes): + add %edi, %edx + add %ecx, %eax + testl $0x20, FLAGS + jz L(ret2) + xchg %eax, %edx + + .p2align 4 +L(ret2): + mov %esi, %ecx +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + addl $4, %esp + cfi_adjust_cfa_offset (-4) +#endif + POP (%esi) + POP (%edi) +#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + POP (FLAGS) +#endif +L(less16bytes): + test %cl, %cl + jz L(2next_8_bytes) + + test $0x01, %cl + jnz L(Byte0) + + test $0x02, %cl + jnz L(Byte1) + + test $0x04, %cl + jnz L(Byte2) + + test $0x08, %cl + jnz L(Byte3) + + test $0x10, %cl + jnz L(Byte4) + + test $0x20, %cl + jnz L(Byte5) + + test $0x40, %cl + jnz L(Byte6) +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + jbe L(eq) +#endif + + movzx 7(%eax), %ecx + movzx 7(%edx), %eax +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte0): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $0, REM + jbe L(eq) +#endif + movzx (%eax), %ecx + movzx (%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte1): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $1, REM + jbe L(eq) +#endif + movzx 1(%eax), %ecx + movzx 1(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte2): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $2, REM + jbe L(eq) +#endif + movzx 2(%eax), %ecx + movzx 2(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte3): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $3, REM + jbe L(eq) +#endif + movzx 3(%eax), %ecx + movzx 3(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + 
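+	/* _nl_C_LC_CTYPE_tolower is a table of int32_t entries indexed
+	   from -128, hence the 128*4-byte bias on the base address and
+	   the scale of 4 on the zero-extended byte; in PIC builds %ebx
+	   holds the GOT base used by the @GOTOFF references.  */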
movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte4): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $4, REM + jbe L(eq) +#endif + movzx 4(%eax), %ecx + movzx 4(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte5): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $5, REM + jbe L(eq) +#endif + movzx 5(%eax), %ecx + movzx 5(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(Byte6): +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $6, REM + jbe L(eq) +#endif + movzx 6(%eax), %ecx + movzx 6(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +L(2next_8_bytes): + add $8, %eax + add $8, %edx +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $8, REM + lea -8(REM), REM + jbe L(eq) +#endif + + test $0x01, %ch + jnz L(Byte0) + + test $0x02, %ch + jnz L(Byte1) + + test $0x04, %ch + jnz L(Byte2) + + test $0x08, %ch + jnz L(Byte3) + + test $0x10, %ch + jnz L(Byte4) + + test $0x20, %ch + jnz L(Byte5) + + test $0x40, %ch + jnz L(Byte6) + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $7, REM + jbe L(eq) +#endif + movzx 7(%eax), %ecx + movzx 7(%edx), %eax + +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%eax,4), %eax +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%eax,4), %eax +# endif +#endif + + sub %ecx, %eax + RETURN + +#ifdef USE_AS_STRNCMP +L(neq_sncmp): +#endif +L(neq): + mov $1, %eax + ja L(neq_bigger) + neg %eax +L(neq_bigger): +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + addl $4, %esp + cfi_adjust_cfa_offset (-4) +#endif +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + POP (REM) +#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + POP (%ebx) +# endif +#endif + ret + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + .p2align 4 + cfi_restore_state +L(more8byteseq): + +# ifdef USE_AS_STRNCASECMP_L + addl $4, %esp + cfi_adjust_cfa_offset (-4) +# endif + POP (%esi) + POP (%edi) +# ifdef USE_AS_STRNCMP + POP (FLAGS) +# endif +#endif + +#ifdef USE_AS_STRNCMP +L(eq_sncmp): +#endif +L(eq): + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + POP (REM) 
+#endif +#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# ifdef PIC + POP (%ebx) +# endif +#endif + xorl %eax, %eax + ret + +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + .p2align 4 +# if defined USE_AS_STRNCASECMP_L && defined PIC + CFI_PUSH (%ebx) +# endif + CFI_PUSH (REM) +L(less16bytes_sncmp): +# ifdef USE_AS_STRNCASECMP_L + PUSH (%esi) +# endif + test REM, REM + jz L(eq_sncmp) + + movzbl (%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl (%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, (%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $1, REM + je L(eq_sncmp) + + movzbl 1(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 1(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 1(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $2, REM + je L(eq_sncmp) + + movzbl 2(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 2(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 2(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $3, REM + je L(eq_sncmp) + + movzbl 3(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 3(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 3(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $4, REM + je L(eq_sncmp) + + movzbl 4(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 4(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 4(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $5, REM + je L(eq_sncmp) + + movzbl 5(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 5(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 5(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $6, REM + je L(eq_sncmp) + + movzbl 6(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 6(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl 
_nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 6(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $7, REM + je L(eq_sncmp) + + movzbl 7(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 7(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 7(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + + cmp $8, REM + je L(eq_sncmp) + + movzbl 8(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 8(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 8(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $9, REM + je L(eq_sncmp) + + movzbl 9(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 9(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 9(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $10, REM + je L(eq_sncmp) + + movzbl 10(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 10(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 10(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $11, REM + je L(eq_sncmp) + + movzbl 11(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 11(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 11(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + + cmp $12, REM + je L(eq_sncmp) + + movzbl 12(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 12(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 12(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $13, REM + je L(eq_sncmp) + + movzbl 13(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 13(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 13(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $14, REM + je L(eq_sncmp) + + movzbl 14(%eax), 
%ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 14(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 14(%edx) +# endif + jne L(neq_sncmp) + test %cl, %cl + je L(eq_sncmp) + + cmp $15, REM + je L(eq_sncmp) + + movzbl 15(%eax), %ecx +# ifdef USE_AS_STRNCASECMP_L + movzbl 15(%edx), %esi +# ifdef PIC + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower@GOTOFF+128*4(%ebx,%esi,4), %esi +# else + movl _nl_C_LC_CTYPE_tolower+128*4(,%ecx,4), %ecx + movl _nl_C_LC_CTYPE_tolower+128*4(,%esi,4), %esi +# endif + cmpl %ecx, %esi +# else + cmpb %cl, 15(%edx) +# endif + jne L(neq_sncmp) + +# ifdef USE_AS_STRNCASECMP_L +L(eq_sncmp): + POP (%esi) +# endif + POP (REM) +# if defined USE_AS_STRNCASECMP_L && defined PIC + POP (%ebx) +# endif + xor %eax, %eax + ret + +# ifdef USE_AS_STRNCASECMP_L + .p2align 4 +# ifdef PIC + CFI_PUSH (%ebx) +# endif + CFI_PUSH (REM) + CFI_PUSH (%esi) +L(neq_sncmp): + mov $1, %eax + mov $-1, %edx + cmovna %edx, %eax + POP (%esi) + POP (REM) +# ifdef PIC + POP (%ebx) +# endif + ret +# endif +#endif + +END (STRCMP) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S new file mode 100644 index 0000000000..56de25a4b7 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcmp.S @@ -0,0 +1,95 @@ +/* Multiple versions of strcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#ifdef USE_AS_STRNCMP +# define STRCMP strncmp +# define __GI_STRCMP __GI_strncmp +# define __STRCMP_IA32 __strncmp_ia32 +# define __STRCMP_SSSE3 __strncmp_ssse3 +# define __STRCMP_SSE4_2 __strncmp_sse4_2 +#elif defined USE_AS_STRCASECMP_L +# define STRCMP __strcasecmp_l +# define __GI_STRCMP __GI_strcasecmp_l +# define __STRCMP_IA32 __strcasecmp_l_ia32 +# define __STRCMP_SSSE3 __strcasecmp_l_ssse3 +# define __STRCMP_SSE4_2 __strcasecmp_l_sse4_2 +#elif defined USE_AS_STRNCASECMP_L +# define STRCMP __strncasecmp_l +# define __GI_STRCMP __GI_strncasecmp_l +# define __STRCMP_IA32 __strncasecmp_l_ia32 +# define __STRCMP_SSSE3 __strncasecmp_l_ssse3 +# define __STRCMP_SSE4_2 __strncasecmp_l_sse4_2 +#else +# define STRCMP strcmp +# define __GI_STRCMP __GI_strcmp +# define __STRCMP_IA32 __strcmp_ia32 +# define __STRCMP_SSSE3 __strcmp_ssse3 +# define __STRCMP_SSE4_2 __strcmp_sse4_2 +#endif + +/* Define multiple versions only for the definition in libc. 
Don't + define multiple versions for strncmp in static library since we + need strncmp before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc) + .text +ENTRY(STRCMP) + .type STRCMP, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__STRCMP_IA32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__STRCMP_SSSE3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + LOAD_FUNC_GOT_EAX (__STRCMP_SSE4_2) +2: ret +END(STRCMP) + +# undef ENTRY +# define ENTRY(name) \ + .type __STRCMP_IA32, @function; \ + .p2align 4; \ + .globl __STRCMP_IA32; \ + .hidden __STRCMP_IA32; \ + __STRCMP_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __STRCMP_IA32, .-__STRCMP_IA32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCMP; __GI_STRCMP = __STRCMP_IA32 +# endif +#endif + +#if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L \ + && !defined USE_AS_STRNCASECMP_L +# include "../strcmp.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S new file mode 100644 index 0000000000..ed627a5f62 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-sse2.S @@ -0,0 +1,2250 @@ +/* strcpy with SSE2 and unaligned load + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#if IS_IN (libc) + +# include <sysdep.h> + + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef STRCPY +# define STRCPY __strcpy_sse2 +# endif + +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +# ifdef USE_AS_STRNCPY +# define PARMS 16 +# define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi) +# define RETURN POP(%edi); POP(%esi); POP(%ebx); ret; \ + CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi); + +# ifdef SHARED +# define JMPTBL(I, B) I - B + +/* Load an entry in a jump table into ECX and branch to it. TABLE is a + jump table with relative offsets. + INDEX is a register contains the index into the jump table. + SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into ECX. */ \ + SETUP_PIC_REG(cx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ecx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. 
*/ \ + addl (%ecx,INDEX,SCALE), %ecx; \ + /* We loaded the jump table and adjusted ECX. Go. */ \ + jmp *%ecx +# else +# define JMPTBL(I, B) I + +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + +.text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edi + mov STR2(%esp), %esi + movl LEN(%esp), %ebx + test %ebx, %ebx + jz L(ExitZero) + + mov %esi, %ecx +# ifndef USE_AS_STPCPY + mov %edi, %eax /* save result */ +# endif + and $15, %ecx + jz L(SourceStringAlignmentZero) + + and $-16, %esi + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%esi), %xmm1 + add %ecx, %ebx + pmovmskb %xmm1, %edx + shr %cl, %edx +# ifdef USE_AS_STPCPY + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# else + cmp $17, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $32, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# else + cmp $33, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%edi) + + sub %ecx, %edi + +/* If source address alignment != destination address alignment */ + .p2align 4 +L(Unalign16Both): + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $48, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm2) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm3) + + movaps 16(%esi, %ecx), %xmm4 + movdqu %xmm3, (%edi, %ecx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm4) + + movaps 16(%esi, %ecx), %xmm1 + movdqu %xmm4, (%edi, %ecx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm1) + + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm2) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm3) + + movdqu %xmm3, (%edi, %ecx) + mov %esi, %edx + lea 16(%esi, %ecx), %esi + and $-0x40, %esi + sub %esi, %edx + sub %edx, %edi + lea 128(%ebx, %edx), %ebx + +L(Unaligned64Loop): + movaps (%esi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%esi), %xmm5 + movaps 32(%esi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%esi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) + test %edx, %edx + jnz L(Unaligned64Leave) +L(Unaligned64Loop_start): + add $64, %edi + add $64, 
%esi + movdqu %xmm4, -64(%edi) + movaps (%esi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%edi) + movaps 16(%esi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%esi), %xmm3 + movdqu %xmm6, -32(%edi) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%edi) + movaps 48(%esi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) + test %edx, %edx + jz L(Unaligned64Loop_start) +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %ecx, %ecx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %ecx, %edx + movdqu %xmm4, (%edi) + movdqu %xmm5, 16(%edi) + movdqu %xmm6, 32(%edi) +# ifdef USE_AS_STPCPY + lea 48(%edi, %edx), %eax +# endif + movdqu %xmm7, 48(%edi) + add $15, %ebx + sub %edx, %ebx + lea 49(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + +/* If source address alignment == destination address alignment */ + +L(SourceStringAlignmentZero): + pxor %xmm0, %xmm0 + movdqa (%esi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# else + cmp $17, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1) + + pcmpeqb 16(%esi), %xmm0 + movdqu %xmm1, (%edi) + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $32, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# else + cmp $33, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes1) + + jmp L(Unalign16Both) + +/*-----------------End of main part---------------------------*/ + +/* Case1 */ + .p2align 4 +L(CopyFrom1To16BytesTail): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %esi + add $16, %edi + sub $16, %ebx +L(CopyFrom1To16BytesTail1): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + sub %ecx, %ebx + bsf %edx, %edx + add %ecx, %esi + add $16, %edx + sub %ecx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %edx, %edx +# ifdef USE_AS_STPCPY + lea (%edi, %edx), %eax +# endif + movdqu %xmm4, (%edi) + add $63, %ebx + sub %edx, %ebx + lea 1(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %ecx, %edx + movdqu %xmm4, (%edi) +# ifdef USE_AS_STPCPY + lea 16(%edi, %edx), %eax +# endif + movdqu %xmm5, 16(%edi) + add $47, %ebx + sub %edx, %ebx + lea 17(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %edx, %edx + movdqu %xmm4, (%edi) + movdqu %xmm5, 16(%edi) +# ifdef USE_AS_STPCPY + lea 32(%edi, %edx), %eax +# endif + movdqu %xmm6, 32(%edi) + add $31, %ebx + sub %edx, %ebx + lea 33(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm6): + movdqu %xmm6, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm5): + movdqu %xmm5, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm4): + movdqu %xmm4, (%edi, %ecx) + jmp 
L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm3): + movdqu %xmm3, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm1): + movdqu %xmm1, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %ecx, %edi + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + add $16, %edx + sub %ecx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +L(CopyFrom1To16BytesTailCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %ecx, %edi + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To32BytesCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTailCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %edi + add $16, %esi + sub $16, %ebx +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(Exit0): +# ifdef USE_AS_STPCPY + mov %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit1): + movb %dh, (%edi) +# ifdef USE_AS_STPCPY + lea (%edi), %eax +# endif + sub $1, %ebx + lea 1(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit2): + movw (%esi), %dx + movw %dx, (%edi) +# ifdef USE_AS_STPCPY + lea 1(%edi), %eax +# endif + sub $2, %ebx + lea 2(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit3): + movw (%esi), %cx + movw %cx, (%edi) + movb %dh, 2(%edi) +# ifdef USE_AS_STPCPY + lea 2(%edi), %eax +# endif + sub $3, %ebx + lea 3(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit4): + movl (%esi), %edx + movl %edx, (%edi) +# ifdef USE_AS_STPCPY + lea 3(%edi), %eax +# endif + sub $4, %ebx + lea 4(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit5): + movl (%esi), %ecx + movb %dh, 4(%edi) + movl %ecx, (%edi) +# ifdef USE_AS_STPCPY + lea 4(%edi), %eax +# endif + sub $5, %ebx + lea 5(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%edi) + movw %dx, 4(%edi) +# ifdef USE_AS_STPCPY + lea 5(%edi), %eax +# endif + sub $6, %ebx + lea 6(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%edi) + movl %edx, 3(%edi) +# ifdef USE_AS_STPCPY + lea 6(%edi), %eax +# endif + sub $7, 
%ebx + lea 7(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 7(%edi), %eax +# endif + sub $8, %ebx + lea 8(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit9): + movlpd (%esi), %xmm0 + movb %dh, 8(%edi) + movlpd %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 8(%edi), %eax +# endif + sub $9, %ebx + lea 9(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 9(%edi), %eax +# endif + sub $10, %ebx + lea 10(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) +# ifdef USE_AS_STPCPY + lea 10(%edi), %eax +# endif + sub $11, %ebx + lea 11(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 11(%edi), %eax +# endif + sub $12, %ebx + lea 12(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 5(%edi) +# ifdef USE_AS_STPCPY + lea 12(%edi), %eax +# endif + sub $13, %ebx + lea 13(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 6(%edi) +# ifdef USE_AS_STPCPY + lea 13(%edi), %eax +# endif + sub $14, %ebx + lea 14(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 7(%edi) +# ifdef USE_AS_STPCPY + lea 14(%edi), %eax +# endif + sub $15, %ebx + lea 15(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 15(%edi), %eax +# endif + sub $16, %ebx + lea 16(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit17): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) + movb %dh, 16(%edi) +# ifdef USE_AS_STPCPY + lea 16(%edi), %eax +# endif + sub $17, %ebx + lea 17(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%edi) + movw %cx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 17(%edi), %eax +# endif + sub $18, %ebx + lea 18(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 15(%edi) +# ifdef USE_AS_STPCPY + lea 18(%edi), %eax +# endif + sub $19, %ebx + lea 19(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 19(%edi), %eax +# endif + sub $20, %ebx + lea 20(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) + movb %dh, 20(%edi) +# ifdef USE_AS_STPCPY + lea 20(%edi), %eax +# endif + sub $21, %ebx + lea 21(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 14(%edi) 
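+/* Note on the L(ExitN) stubs: each handles a terminator found N bytes
+   into the current chunk.  The stub copies those N bytes (the last
+   one being the NUL) with at most two overlapping loads, sets the
+   stpcpy return value when requested, and branches to
+   L(StrncpyFillTailWithZero) when padding bytes remain.  Roughly, in
+   C (a sketch of the semantics, not the exact code; n is the copied
+   length including the NUL and count the strncpy bound):
+
+       memcpy (dst, src, n);
+       if (count > n)
+         memset (dst + n, '\0', count - n);
+*/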
+# ifdef USE_AS_STPCPY + lea 21(%edi), %eax +# endif + sub $22, %ebx + lea 22(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 15(%edi) +# ifdef USE_AS_STPCPY + lea 22(%edi), %eax +# endif + sub $23, %ebx + lea 23(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 23(%edi), %eax +# endif + sub $24, %ebx + lea 24(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit25): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movb %dh, 24(%edi) +# ifdef USE_AS_STPCPY + lea 24(%edi), %eax +# endif + sub $25, %ebx + lea 25(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movw %cx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 25(%edi), %eax +# endif + sub $26, %ebx + lea 26(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 23(%edi) +# ifdef USE_AS_STPCPY + lea 26(%edi), %eax +# endif + sub $27, %ebx + lea 27(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 27(%edi), %eax +# endif + sub $28, %ebx + lea 28(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 13(%edi) +# ifdef USE_AS_STPCPY + lea 28(%edi), %eax +# endif + sub $29, %ebx + lea 29(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 14(%edi) +# ifdef USE_AS_STPCPY + lea 29(%edi), %eax +# endif + sub $30, %ebx + lea 30(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + + .p2align 4 +L(Exit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 15(%edi) +# ifdef USE_AS_STPCPY + lea 30(%edi), %eax +# endif + sub $31, %ebx + lea 31(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 31(%edi), %eax +# endif + sub $32, %ebx + lea 32(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(StrncpyExit1): + movb (%esi), %dl + movb %dl, (%edi) +# ifdef USE_AS_STPCPY + lea 1(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit2): + movw (%esi), %dx + movw %dx, (%edi) +# ifdef USE_AS_STPCPY + lea 2(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit3): + movw (%esi), %cx + movb 2(%esi), %dl + movw %cx, (%edi) + movb %dl, 2(%edi) +# ifdef USE_AS_STPCPY + lea 3(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit4): + movl (%esi), %edx + movl %edx, (%edi) +# ifdef USE_AS_STPCPY + lea 4(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit5): + movl (%esi), %ecx + movb 4(%esi), %dl + movl %ecx, (%edi) + movb %dl, 4(%edi) +# ifdef USE_AS_STPCPY + lea 
5(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit6): + movl (%esi), %ecx + movw 4(%esi), %dx + movl %ecx, (%edi) + movw %dx, 4(%edi) +# ifdef USE_AS_STPCPY + lea 6(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit7): + movl (%esi), %ecx + movl 3(%esi), %edx + movl %ecx, (%edi) + movl %edx, 3(%edi) +# ifdef USE_AS_STPCPY + lea 7(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit8): + movlpd (%esi), %xmm0 + movlpd %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 8(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit9): + movlpd (%esi), %xmm0 + movb 8(%esi), %dl + movlpd %xmm0, (%edi) + movb %dl, 8(%edi) +# ifdef USE_AS_STPCPY + lea 9(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit10): + movlpd (%esi), %xmm0 + movw 8(%esi), %dx + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 10(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit11): + movlpd (%esi), %xmm0 + movl 7(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) +# ifdef USE_AS_STPCPY + lea 11(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit12): + movlpd (%esi), %xmm0 + movl 8(%esi), %edx + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) +# ifdef USE_AS_STPCPY + lea 12(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit13): + movlpd (%esi), %xmm0 + movlpd 5(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 5(%edi) +# ifdef USE_AS_STPCPY + lea 13(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit14): + movlpd (%esi), %xmm0 + movlpd 6(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 6(%edi) +# ifdef USE_AS_STPCPY + lea 14(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit15): + movlpd (%esi), %xmm0 + movlpd 7(%esi), %xmm1 + movlpd %xmm0, (%edi) + movlpd %xmm1, 7(%edi) +# ifdef USE_AS_STPCPY + lea 15(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit16): + movdqu (%esi), %xmm0 + movdqu %xmm0, (%edi) +# ifdef USE_AS_STPCPY + lea 16(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit17): + movdqu (%esi), %xmm0 + movb 16(%esi), %cl + movdqu %xmm0, (%edi) + movb %cl, 16(%edi) +# ifdef USE_AS_STPCPY + lea 17(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit18): + movdqu (%esi), %xmm0 + movw 16(%esi), %cx + movdqu %xmm0, (%edi) + movw %cx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 18(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit19): + movdqu (%esi), %xmm0 + movl 15(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 15(%edi) +# ifdef USE_AS_STPCPY + lea 19(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit20): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) +# ifdef USE_AS_STPCPY + lea 20(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit21): + movdqu (%esi), %xmm0 + movl 16(%esi), %ecx + movb 20(%esi), %dl + movdqu %xmm0, (%edi) + movl %ecx, 16(%edi) + movb %dl, 20(%edi) +# ifdef USE_AS_STPCPY + lea 21(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit22): + movdqu (%esi), %xmm0 + movlpd 14(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 14(%edi) +# ifdef USE_AS_STPCPY + lea 22(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit23): + movdqu (%esi), %xmm0 + movlpd 15(%esi), %xmm3 + movdqu %xmm0, (%edi) + movlpd %xmm3, 15(%edi) +# ifdef USE_AS_STPCPY + lea 23(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit24): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 24(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit25): + movdqu 
(%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movb 24(%esi), %cl + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movb %cl, 24(%edi) +# ifdef USE_AS_STPCPY + lea 25(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit26): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movw 24(%esi), %cx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movw %cx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 26(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit27): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 23(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 23(%edi) +# ifdef USE_AS_STPCPY + lea 27(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit28): + movdqu (%esi), %xmm0 + movlpd 16(%esi), %xmm2 + movl 24(%esi), %ecx + movdqu %xmm0, (%edi) + movlpd %xmm2, 16(%edi) + movl %ecx, 24(%edi) +# ifdef USE_AS_STPCPY + lea 28(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit29): + movdqu (%esi), %xmm0 + movdqu 13(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 13(%edi) +# ifdef USE_AS_STPCPY + lea 29(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit30): + movdqu (%esi), %xmm0 + movdqu 14(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 14(%edi) +# ifdef USE_AS_STPCPY + lea 30(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit31): + movdqu (%esi), %xmm0 + movdqu 15(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 15(%edi) +# ifdef USE_AS_STPCPY + lea 31(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit32): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) +# ifdef USE_AS_STPCPY + lea 32(%edi), %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit33): + movdqu (%esi), %xmm0 + movdqu 16(%esi), %xmm2 + movb 32(%esi), %cl + movdqu %xmm0, (%edi) + movdqu %xmm2, 16(%edi) + movb %cl, 32(%edi) + RETURN + + .p2align 4 +L(Fill0): + RETURN + + .p2align 4 +L(Fill1): + movb %dl, (%edi) + RETURN + + .p2align 4 +L(Fill2): + movw %dx, (%edi) + RETURN + + .p2align 4 +L(Fill3): + movl %edx, -1(%edi) + RETURN + + .p2align 4 +L(Fill4): + movl %edx, (%edi) + RETURN + + .p2align 4 +L(Fill5): + movl %edx, (%edi) + movb %dl, 4(%edi) + RETURN + + .p2align 4 +L(Fill6): + movl %edx, (%edi) + movw %dx, 4(%edi) + RETURN + + .p2align 4 +L(Fill7): + movlpd %xmm0, -1(%edi) + RETURN + + .p2align 4 +L(Fill8): + movlpd %xmm0, (%edi) + RETURN + + .p2align 4 +L(Fill9): + movlpd %xmm0, (%edi) + movb %dl, 8(%edi) + RETURN + + .p2align 4 +L(Fill10): + movlpd %xmm0, (%edi) + movw %dx, 8(%edi) + RETURN + + .p2align 4 +L(Fill11): + movlpd %xmm0, (%edi) + movl %edx, 7(%edi) + RETURN + + .p2align 4 +L(Fill12): + movlpd %xmm0, (%edi) + movl %edx, 8(%edi) + RETURN + + .p2align 4 +L(Fill13): + movlpd %xmm0, (%edi) + movlpd %xmm0, 5(%edi) + RETURN + + .p2align 4 +L(Fill14): + movlpd %xmm0, (%edi) + movlpd %xmm0, 6(%edi) + RETURN + + .p2align 4 +L(Fill15): + movdqu %xmm0, -1(%edi) + RETURN + + .p2align 4 +L(Fill16): + movdqu %xmm0, (%edi) + RETURN + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm2): + movdqu %xmm2, (%edi, %ecx) + + .p2align 4 +L(CopyFrom1To16BytesXmmExit): + bsf %edx, %edx + add $15, %ebx + add %ecx, %edi +# ifdef USE_AS_STPCPY + lea (%edi, %edx), %eax +# endif + sub %edx, %ebx + lea 1(%edi, %edx), %edi + + .p2align 4 +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %edx, %edx + sub $16, %ebx + jbe L(StrncpyFillExit) + + movdqu %xmm0, (%edi) + add $16, %edi + + mov %edi, %esi + and $0xf, %esi + sub %esi, %edi + add %esi, %ebx + sub $64, %ebx + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + 
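+/* The destination was rounded down to a 16-byte boundary just above,
+   so this loop can emit aligned movdqa stores, zeroing 64 bytes per
+   iteration while at least 64 bytes of padding remain; shorter tails
+   are finished through the L(FillTable) jump table.  A rough C sketch
+   (p and nleft standing in for %edi and %ebx):
+
+       while (nleft >= 64)
+         { memset (p, 0, 64); p += 64; nleft -= 64; }
+*/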
movdqa %xmm0, (%edi) + movdqa %xmm0, 16(%edi) + movdqa %xmm0, 32(%edi) + movdqa %xmm0, 48(%edi) + add $64, %edi + sub $64, %ebx + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %ebx + jl L(StrncpyFillLess32) + movdqa %xmm0, (%edi) + movdqa %xmm0, 16(%edi) + add $32, %edi + sub $16, %ebx + jl L(StrncpyFillExit) + movdqa %xmm0, (%edi) + add $16, %edi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + +L(StrncpyFillLess32): + add $16, %ebx + jl L(StrncpyFillExit) + movdqa %xmm0, (%edi) + add $16, %edi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + +L(StrncpyFillExit): + add $16, %ebx + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %ebx, 4) + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %edx, %edx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%ebx), %ecx + and $-16, %ecx + add $48, %ebx + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%edi) + sub $16, %ebx + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%edi) +# ifdef USE_AS_STPCPY + lea 64(%edi), %eax +# endif + RETURN + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %ecx, %ecx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $48, %ebx + jle L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm4) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm4, (%edi) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm5) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm5, 16(%edi) + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm6) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %edx + movdqu %xmm6, 32(%edi) + lea 16(%edi, %ecx), %edi + lea 16(%esi, %ecx), %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(ExitZero): + movl %edi, %eax + RETURN + +END (STRCPY) + + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + .int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) + +L(ExitStrncpyTable): + .int JMPTBL(L(Exit0), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable)) + .int 
JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) + + .p2align 4 +L(FillTable): + .int JMPTBL(L(Fill0), L(FillTable)) + .int JMPTBL(L(Fill1), L(FillTable)) + .int JMPTBL(L(Fill2), L(FillTable)) + .int JMPTBL(L(Fill3), L(FillTable)) + .int JMPTBL(L(Fill4), L(FillTable)) + .int JMPTBL(L(Fill5), L(FillTable)) + .int JMPTBL(L(Fill6), L(FillTable)) + .int JMPTBL(L(Fill7), L(FillTable)) + .int JMPTBL(L(Fill8), L(FillTable)) + .int JMPTBL(L(Fill9), L(FillTable)) + .int JMPTBL(L(Fill10), L(FillTable)) + .int JMPTBL(L(Fill11), L(FillTable)) + .int JMPTBL(L(Fill12), L(FillTable)) + .int JMPTBL(L(Fill13), L(FillTable)) + .int JMPTBL(L(Fill14), L(FillTable)) + .int JMPTBL(L(Fill15), L(FillTable)) + .int JMPTBL(L(Fill16), L(FillTable)) +# else +# define PARMS 4 +# define ENTRANCE +# define RETURN POP (%edi); ret; CFI_PUSH (%edi) +# define RETURN1 ret + + .text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edx + mov STR2(%esp), %ecx + + cmpb $0, (%ecx) + jz L(ExitTail1) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + cmpb $0, 3(%ecx) + jz L(ExitTail4) + cmpb $0, 4(%ecx) + jz L(ExitTail5) + cmpb $0, 5(%ecx) + jz L(ExitTail6) + cmpb $0, 6(%ecx) + jz L(ExitTail7) + cmpb $0, 7(%ecx) + jz L(ExitTail8) + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + cmpb $0, 11(%ecx) + jz L(ExitTail12) + cmpb $0, 12(%ecx) + jz L(ExitTail13) + cmpb $0, 13(%ecx) + jz L(ExitTail14) + cmpb $0, 14(%ecx) + jz L(ExitTail15) + cmpb $0, 15(%ecx) + jz L(ExitTail16) + + PUSH (%edi) + PUSH (%ebx) + + mov %edx, %edi + lea 16(%ecx), %ebx + and $-16, %ebx + pxor %xmm0, %xmm0 + movdqu (%ecx), %xmm1 + movdqu %xmm1, (%edx) + pcmpeqb (%ebx), %xmm0 + pmovmskb %xmm0, %eax + sub %ecx, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + mov %ecx, %eax + lea 16(%ecx), %ecx + and $-16, %ecx + sub %ecx, %eax + sub %eax, %edx + xor %ebx, %ebx + + .p2align 
4 + movdqa (%ecx), %xmm1 + movaps 16(%ecx), %xmm2 + movdqu %xmm1, (%edx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm3 + movdqu %xmm2, (%edx, %ebx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm4 + movdqu %xmm3, (%edx, %ebx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm1 + movdqu %xmm4, (%edx, %ebx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm2 + movdqu %xmm1, (%edx, %ebx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %ebx), %xmm3 + movdqu %xmm2, (%edx, %ebx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + add $16, %ebx + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movdqu %xmm3, (%edx, %ebx) + mov %ecx, %eax + lea 16(%ecx, %ebx), %ecx + and $-0x40, %ecx + sub %ecx, %eax + sub %eax, %edx + +L(Aligned64Loop): + movaps (%ecx), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%ecx), %xmm5 + movaps 32(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%ecx), %xmm7 + pminub %xmm5, %xmm2 + add $64, %ecx + pminub %xmm7, %xmm3 + add $64, %edx + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(Aligned64Leave) +L(Aligned64Loop_start): + movdqu %xmm4, -64(%edx) + movaps (%ecx), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%edx) + movaps 16(%ecx), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%ecx), %xmm3 + movdqu %xmm6, -32(%edx) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%edx) + movaps 48(%ecx), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + add $64, %edx + add $64, %ecx + test %eax, %eax + jz L(Aligned64Loop_start) +L(Aligned64Leave): + sub $0xa0, %ebx + pxor %xmm0, %xmm0 + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %eax + movdqu %xmm4, -64(%edx) + test %eax, %eax + lea 16(%ebx), %ebx + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %eax + movdqu %xmm5, -48(%edx) + test %eax, %eax + lea 16(%ebx), %ebx + jnz L(CopyFrom1To16Bytes) + + movdqu %xmm6, -32(%edx) + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%ebx), %ebx + +/*-----------------End of main part---------------------------*/ + + .p2align 4 +L(CopyFrom1To16Bytes): + add %ebx, %edx + add %ebx, %ecx + + POP (%ebx) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + /* Exit 8 */ + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 7(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + /* Exit 16 */ + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm0 + movlpd %xmm0, 8(%edx) +# ifdef USE_AS_STPCPY + lea 15(%edx), %eax +# else + movl %edi, %eax +# endif 
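+/* %eax now holds the return value: the address of the copied NUL for
+   stpcpy, the saved original destination for strcpy.  */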
+ RETURN + + .p2align 4 +L(Exit1): + movb (%ecx), %al + movb %al, (%edx) +# ifdef USE_AS_STPCPY + lea (%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit2): + movw (%ecx), %ax + movw %ax, (%edx) +# ifdef USE_AS_STPCPY + lea 1(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) +# ifdef USE_AS_STPCPY + lea 2(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) +# ifdef USE_AS_STPCPY + lea 3(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) +# ifdef USE_AS_STPCPY + lea 4(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 5(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) +# ifdef USE_AS_STPCPY + lea 6(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit9): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) +# ifdef USE_AS_STPCPY + lea 8(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit10): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 9(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit11): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) +# ifdef USE_AS_STPCPY + lea 10(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit12): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 11(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) +# ifdef USE_AS_STPCPY + lea 12(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) +# ifdef USE_AS_STPCPY + lea 13(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) +# ifdef USE_AS_STPCPY + lea 14(%edx), %eax +# else + movl %edi, %eax +# endif + RETURN + +CFI_POP (%edi) + + .p2align 4 +L(ExitTail1): + movb (%ecx), %al + movb %al, (%edx) + movl %edx, %eax + RETURN1 + + .p2align 4 +L(ExitTail2): + movw (%ecx), %ax + movw %ax, (%edx) +# ifdef USE_AS_STPCPY + lea 1(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) +# ifdef USE_AS_STPCPY + lea 2(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail4): + movl (%ecx), %eax + movl %eax, (%edx) +# ifdef USE_AS_STPCPY + lea 3(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) +# ifdef 
USE_AS_STPCPY + lea 4(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 5(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) +# ifdef USE_AS_STPCPY + lea 6(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail8): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) +# ifdef USE_AS_STPCPY + lea 7(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail9): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movb 8(%ecx), %al + movb %al, 8(%edx) +# ifdef USE_AS_STPCPY + lea 8(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail10): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movw 8(%ecx), %ax + movw %ax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 9(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail11): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 7(%ecx), %eax + movl %eax, 7(%edx) +# ifdef USE_AS_STPCPY + lea 10(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail12): + movl (%ecx), %eax + movl %eax, (%edx) + movl 4(%ecx), %eax + movl %eax, 4(%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) +# ifdef USE_AS_STPCPY + lea 11(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail13): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 5(%ecx), %xmm0 + movlpd %xmm0, 5(%edx) +# ifdef USE_AS_STPCPY + lea 12(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail14): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 6(%ecx), %xmm0 + movlpd %xmm0, 6(%edx) +# ifdef USE_AS_STPCPY + lea 13(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail15): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) +# ifdef USE_AS_STPCPY + lea 14(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + + .p2align 4 +L(ExitTail16): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 8(%ecx), %xmm0 + movlpd %xmm0, 8(%edx) +# ifdef USE_AS_STPCPY + lea 15(%edx), %eax +# else + movl %edx, %eax +# endif + RETURN1 + +END (STRCPY) +# endif + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S new file mode 100644 index 0000000000..effd85da94 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy-ssse3.S @@ -0,0 +1,3901 @@ +/* strcpy with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + +#if IS_IN (libc) + +# ifndef USE_AS_STRCAT +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# ifndef STRCPY +# define STRCPY __strcpy_ssse3 +# endif + +# ifdef USE_AS_STRNCPY +# define PARMS 8 +# define ENTRANCE PUSH (%ebx) +# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx); +# define RETURN1 POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi) +# else +# define PARMS 4 +# define ENTRANCE +# define RETURN ret +# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi) +# endif + +# ifdef USE_AS_STPCPY +# define SAVE_RESULT(n) lea n(%edx), %eax +# define SAVE_RESULT_TAIL(n) lea n(%edx), %eax +# else +# define SAVE_RESULT(n) movl %edi, %eax +# define SAVE_RESULT_TAIL(n) movl %edx, %eax +# endif + +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + +/* In this code the following instructions are used for copying: + movb - 1 byte + movw - 2 byte + movl - 4 byte + movlpd - 8 byte + movaps - 16 byte - requires 16 byte alignment + of source and destination addresses. +*/ + +.text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edx + mov STR2(%esp), %ecx +# ifdef USE_AS_STRNCPY + movl LEN(%esp), %ebx + cmp $8, %ebx + jbe L(StrncpyExit8Bytes) +# endif + cmpb $0, (%ecx) + jz L(ExitTail1) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + cmpb $0, 3(%ecx) + jz L(ExitTail4) + cmpb $0, 4(%ecx) + jz L(ExitTail5) + cmpb $0, 5(%ecx) + jz L(ExitTail6) + cmpb $0, 6(%ecx) + jz L(ExitTail7) + cmpb $0, 7(%ecx) + jz L(ExitTail8) +# ifdef USE_AS_STRNCPY + cmp $16, %ebx + jb L(StrncpyExit15Bytes) +# endif + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + cmpb $0, 11(%ecx) + jz L(ExitTail12) + cmpb $0, 12(%ecx) + jz L(ExitTail13) + cmpb $0, 13(%ecx) + jz L(ExitTail14) + cmpb $0, 14(%ecx) + jz L(ExitTail15) +# ifdef USE_AS_STRNCPY + cmp $16, %ebx + je L(ExitTail16) +# endif + cmpb $0, 15(%ecx) + jz L(ExitTail16) + + PUSH (%edi) + mov %edx, %edi +# endif + PUSH (%esi) +# ifdef USE_AS_STRNCPY + mov %ecx, %esi + sub $16, %ebx + and $0xf, %esi + +/* add the source alignment offset (ecx & 15) back to ebx */ + + add %esi, %ebx +# endif + lea 16(%ecx), %esi + and $-16, %esi + pxor %xmm0, %xmm0 + movlpd (%ecx), %xmm1 + movlpd %xmm1, (%edx) + + pcmpeqb (%esi), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm1, 8(%edx) + + pmovmskb %xmm0, %eax + sub %ecx, %esi + +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + mov %edx, %eax + lea 16(%edx), %edx + and $-16, %edx + sub %edx, %eax + +# ifdef USE_AS_STRNCPY + add %eax, %esi + lea -1(%esi), %esi + and $1<<31, %esi + test %esi, %esi + jnz L(ContinueCopy) + lea 16(%ebx), %ebx + +L(ContinueCopy): +# endif + sub %eax, %ecx + mov %ecx, %eax + and $0xf, %eax + mov $0, %esi + +/* case: ecx_offset == edx_offset */ + + jz L(Align16Both) + + cmp $8, %eax + jae L(ShlHigh8) + cmp $1, %eax + je L(Shl1) + cmp $2, %eax + je L(Shl2) + cmp $3, %eax + je L(Shl3) + cmp $4, %eax + je L(Shl4) + cmp $5, %eax + je L(Shl5) + cmp $6, %eax + je L(Shl6) + jmp L(Shl7) + +L(ShlHigh8): + je L(Shl8) + cmp $9, %eax + je L(Shl9) + cmp $10, %eax + je L(Shl10) + cmp $11,
%eax + je L(Shl11) + cmp $12, %eax + je L(Shl12) + cmp $13, %eax + je L(Shl13) + cmp $14, %eax + je L(Shl14) + jmp L(Shl15) + +L(Align16Both): + movaps (%ecx), %xmm1 + movaps 16(%ecx), %xmm2 + movaps %xmm1, (%edx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm4 + movaps %xmm3, (%edx, %esi) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm1 + movaps %xmm4, (%edx, %esi) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm2 + movaps %xmm1, (%edx, %esi) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) +# endif + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%edx, %esi) + mov %ecx, %eax + lea 16(%ecx, %esi), %ecx + and $-0x40, %ecx + sub %ecx, %eax + sub %eax, %edx +# ifdef USE_AS_STRNCPY + lea 112(%ebx, %eax), %ebx +# endif + mov $-0x40, %esi + +L(Aligned64Loop): + movaps (%ecx), %xmm2 + movaps 32(%ecx), %xmm3 + movaps %xmm2, %xmm4 + movaps 16(%ecx), %xmm5 + movaps %xmm3, %xmm6 + movaps 48(%ecx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + lea 64(%edx), %edx + pcmpeqb %xmm0, %xmm3 + lea 64(%ecx), %ecx + pmovmskb %xmm3, %eax +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeaveCase2OrCase3) +# endif + test %eax, %eax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%edx) + movaps %xmm5, -48(%edx) + movaps %xmm6, -32(%edx) + movaps %xmm7, -16(%edx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): +# ifdef USE_AS_STRNCPY + lea 48(%ebx), %ebx +# endif + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%ebx), %ebx +# endif + pmovmskb %xmm0, %eax + movaps %xmm4, -64(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%ebx), %ebx +# endif + pmovmskb %xmm0, %eax + movaps %xmm5, -48(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%edx) + pcmpeqb %xmm7, %xmm0 +# ifdef USE_AS_STRNCPY + lea -16(%ebx), %ebx +# endif + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl1): + movaps -1(%ecx), %xmm1 + movaps 15(%ecx), %xmm2 +L(Shl1Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, 
%xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit1Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl1LoopExit) + + palignr $1, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 31(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -15(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -1(%ecx), %xmm1 + +L(Shl1LoopStart): + movaps 15(%ecx), %xmm2 + movaps 31(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 47(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 63(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + test %eax, %eax + palignr $1, %xmm3, %xmm4 + jnz L(Shl1Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave1) +# endif + palignr $1, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $1, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl1LoopStart) + +L(Shl1LoopExit): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) + mov $15, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl2): + movaps -2(%ecx), %xmm1 + movaps 14(%ecx), %xmm2 +L(Shl2Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit2Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl2LoopExit) + + palignr $2, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 30(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -14(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -2(%ecx), 
%xmm1 + +L(Shl2LoopStart): + movaps 14(%ecx), %xmm2 + movaps 30(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 46(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 62(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + test %eax, %eax + palignr $2, %xmm3, %xmm4 + jnz L(Shl2Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave2) +# endif + palignr $2, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $2, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl2LoopStart) + +L(Shl2LoopExit): + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 6(%edx) + mov $14, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl3): + movaps -3(%ecx), %xmm1 + movaps 13(%ecx), %xmm2 +L(Shl3Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit3Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl3LoopExit) + + palignr $3, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 29(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -13(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -3(%ecx), %xmm1 + +L(Shl3LoopStart): + movaps 13(%ecx), %xmm2 + movaps 29(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 45(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 61(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + test %eax, %eax + palignr $3, %xmm3, %xmm4 + jnz L(Shl3Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave3) +# endif + palignr $3, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $3, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl3LoopStart) + +L(Shl3LoopExit): + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) + mov $13, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl4): + movaps -4(%ecx), %xmm1 + movaps 12(%ecx), %xmm2 +L(Shl4Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm3, 
%xmm1 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit4Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 28(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -12(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -4(%ecx), %xmm1 + +L(Shl4LoopStart): + movaps 12(%ecx), %xmm2 + movaps 28(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %eax, %eax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave4) +# endif + palignr $4, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) + mov $12, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl5): + movaps -5(%ecx), %xmm1 + movaps 11(%ecx), %xmm2 +L(Shl5Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit5Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl5LoopExit) + + palignr $5, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 27(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -11(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -5(%ecx), %xmm1 + +L(Shl5LoopStart): + 
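+/* Each L(ShlN)LoopStart iteration scans 64 source bytes for the
+   terminator with three pminub and one pcmpeqb: for unsigned bytes,
+   min(a, b) == 0 exactly when a == 0 or b == 0.  With SSE intrinsics
+   the test is roughly (a sketch, not the generated code):
+
+       __m128i m = _mm_min_epu8 (_mm_min_epu8 (x1, x2),
+                                 _mm_min_epu8 (x3, x4));
+       int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ()));
+
+   A nonzero mask means one of the 64 bytes is NUL.  The palignr
+   instructions then splice adjacent aligned loads so that the stores
+   to the destination stay 16-byte aligned despite the source skew.  */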
movaps 11(%ecx), %xmm2 + movaps 27(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 43(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 59(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $5, %xmm4, %xmm5 + test %eax, %eax + palignr $5, %xmm3, %xmm4 + jnz L(Shl5Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave5) +# endif + palignr $5, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $5, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl5LoopStart) + +L(Shl5LoopExit): + movlpd (%ecx), %xmm0 + movl 7(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 7(%edx) + mov $11, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl6): + movaps -6(%ecx), %xmm1 + movaps 10(%ecx), %xmm2 +L(Shl6Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit6Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl6LoopExit) + + palignr $6, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 26(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -10(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -6(%ecx), %xmm1 + +L(Shl6LoopStart): + movaps 10(%ecx), %xmm2 + movaps 26(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 42(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 58(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $6, %xmm4, %xmm5 + test %eax, %eax + palignr $6, %xmm3, %xmm4 + jnz L(Shl6Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave6) +# endif + palignr $6, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $6, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl6LoopStart) + +L(Shl6LoopExit): + movlpd (%ecx), %xmm0 + movl 6(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 6(%edx) + mov $10, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl7): + movaps -7(%ecx), %xmm1 + movaps 9(%ecx), %xmm2 +L(Shl7Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 
25(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit7Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl7LoopExit) + + palignr $7, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 25(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -9(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -7(%ecx), %xmm1 + +L(Shl7LoopStart): + movaps 9(%ecx), %xmm2 + movaps 25(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 41(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 57(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $7, %xmm4, %xmm5 + test %eax, %eax + palignr $7, %xmm3, %xmm4 + jnz L(Shl7Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave7) +# endif + palignr $7, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $7, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl7LoopStart) + +L(Shl7LoopExit): + movlpd (%ecx), %xmm0 + movl 5(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 5(%edx) + mov $9, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl8): + movaps -8(%ecx), %xmm1 + movaps 8(%ecx), %xmm2 +L(Shl8Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit8Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 24(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -8(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -8(%ecx), %xmm1 + +L(Shl8LoopStart): + movaps 8(%ecx), %xmm2 + movaps 24(%ecx), 
%xmm3 + movaps %xmm3, %xmm6 + movaps 40(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %eax, %eax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave8) +# endif + palignr $8, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + mov $8, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl9): + movaps -9(%ecx), %xmm1 + movaps 7(%ecx), %xmm2 +L(Shl9Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit9Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl9LoopExit) + + palignr $9, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 23(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -7(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -9(%ecx), %xmm1 + +L(Shl9LoopStart): + movaps 7(%ecx), %xmm2 + movaps 23(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 39(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 55(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $9, %xmm4, %xmm5 + test %eax, %eax + palignr $9, %xmm3, %xmm4 + jnz L(Shl9Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave9) +# endif + palignr $9, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $9, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl9LoopStart) + +L(Shl9LoopExit): + movlpd -1(%ecx), %xmm0 + movlpd %xmm0, -1(%edx) + mov $7, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl10): + movaps -10(%ecx), %xmm1 + movaps 6(%ecx), %xmm2 +L(Shl10Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 
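+/* For the strncpy variants %ebx tracks the bytes still allowed to be
+   written; every 16-byte step subtracts 16 and diverts to the
+   matching StrncpyExit*Case2OrCase3 path once the count would run out
+   before a NUL has been found.  */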
+# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit10Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl10LoopExit) + + palignr $10, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 22(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -6(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -10(%ecx), %xmm1 + +L(Shl10LoopStart): + movaps 6(%ecx), %xmm2 + movaps 22(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 38(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 54(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $10, %xmm4, %xmm5 + test %eax, %eax + palignr $10, %xmm3, %xmm4 + jnz L(Shl10Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave10) +# endif + palignr $10, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $10, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl10LoopStart) + +L(Shl10LoopExit): + movlpd -2(%ecx), %xmm0 + movlpd %xmm0, -2(%edx) + mov $6, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl11): + movaps -11(%ecx), %xmm1 + movaps 5(%ecx), %xmm2 +L(Shl11Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit11Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl11LoopExit) + + palignr $11, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 21(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -5(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -11(%ecx), %xmm1 + +L(Shl11LoopStart): + movaps 5(%ecx), %xmm2 + movaps 21(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 37(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 53(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub 
%xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $11, %xmm4, %xmm5 + test %eax, %eax + palignr $11, %xmm3, %xmm4 + jnz L(Shl11Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave11) +# endif + palignr $11, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $11, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl11LoopStart) + +L(Shl11LoopExit): + movlpd -3(%ecx), %xmm0 + movlpd %xmm0, -3(%edx) + mov $5, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl12): + movaps -12(%ecx), %xmm1 + movaps 4(%ecx), %xmm2 +L(Shl12Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit12Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 20(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -4(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -12(%ecx), %xmm1 + +L(Shl12LoopStart): + movaps 4(%ecx), %xmm2 + movaps 20(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %eax, %eax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave12) +# endif + palignr $12, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + movl (%ecx), %esi + movl %esi, (%edx) + mov $4, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl13): + movaps -13(%ecx), %xmm1 + movaps 3(%ecx), %xmm2 +L(Shl13Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + 
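/* Any set bit in %eax marks a null byte somewhere in this 16-byte block; branch to the exit path that copies the final partial block byte-exactly. */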
jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit13Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl13LoopExit) + + palignr $13, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 19(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -3(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -13(%ecx), %xmm1 + +L(Shl13LoopStart): + movaps 3(%ecx), %xmm2 + movaps 19(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 35(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 51(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $13, %xmm4, %xmm5 + test %eax, %eax + palignr $13, %xmm3, %xmm4 + jnz L(Shl13Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave13) +# endif + palignr $13, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $13, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl13LoopStart) + +L(Shl13LoopExit): + movl -1(%ecx), %esi + movl %esi, -1(%edx) + mov $3, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl14): + movaps -14(%ecx), %xmm1 + movaps 2(%ecx), %xmm2 +L(Shl14Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit14Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl14LoopExit) + + palignr $14, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 18(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -2(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -14(%ecx), %xmm1 + +L(Shl14LoopStart): + movaps 2(%ecx), %xmm2 + movaps 18(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 34(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 50(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + 
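/* Keep the last raw 16-byte block in %xmm7 for the next iteration, then realign: each palignr below shifts a vector pair by the fixed source offset so all four stores land 16-byte aligned at the destination. */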
palignr $14, %xmm4, %xmm5 + test %eax, %eax + palignr $14, %xmm3, %xmm4 + jnz L(Shl14Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave14) +# endif + palignr $14, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $14, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl14LoopStart) + +L(Shl14LoopExit): + movl -2(%ecx), %esi + movl %esi, -2(%edx) + mov $2, %esi + jmp L(CopyFrom1To16Bytes) + + .p2align 4 +L(Shl15): + movaps -15(%ecx), %xmm1 + movaps 1(%ecx), %xmm2 +L(Shl15Start): + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm3, %xmm1 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + movaps %xmm3, %xmm1 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + + pcmpeqb %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx +# ifdef USE_AS_STRNCPY + sub $16, %ebx + jbe L(StrncpyExit15Case2OrCase3) +# endif + test %eax, %eax + jnz L(Shl15LoopExit) + + palignr $15, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 17(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -1(%ecx), %ecx + sub %eax, %edx +# ifdef USE_AS_STRNCPY + add %eax, %ebx +# endif + movaps -15(%ecx), %xmm1 + +L(Shl15LoopStart): + movaps 1(%ecx), %xmm2 + movaps 17(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 33(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 49(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqb %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $15, %xmm4, %xmm5 + test %eax, %eax + palignr $15, %xmm3, %xmm4 + jnz L(Shl15Start) +# ifdef USE_AS_STRNCPY + sub $64, %ebx + jbe L(StrncpyLeave15) +# endif + palignr $15, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $15, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl15LoopStart) + +L(Shl15LoopExit): + movl -3(%ecx), %esi + movl %esi, -3(%edx) + mov $1, %esi +# ifdef USE_AS_STRCAT + jmp L(CopyFrom1To16Bytes) +# endif + + +# ifndef USE_AS_STRCAT + + .p2align 4 +L(CopyFrom1To16Bytes): +# ifdef USE_AS_STRNCPY + add $16, %ebx +# endif + add %esi, %edx + add %esi, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh8) + +L(CopyFrom1To16BytesLess8): + mov %al, %ah + and $15, %ah + jz L(ExitHigh4) + + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT (3) +# ifdef USE_AS_STRNCPY + sub $4, %ebx + lea 4(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh4): + 
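/* The low four bits of %al were clear, so the first null byte lies in bytes 4..7 of the block; test bits 4..6 to pick Exit5..Exit7, falling through to Exit8 when byte 7 is the terminator. */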
test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + + .p2align 4 +L(Exit8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + SAVE_RESULT (7) +# ifdef USE_AS_STRNCPY + sub $8, %ebx + lea 8(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh8): + mov %ah, %al + and $15, %al + jz L(ExitHigh12) + + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + + .p2align 4 +L(Exit12): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT (11) +# ifdef USE_AS_STRNCPY + sub $12, %ebx + lea 12(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh12): + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + + .p2align 4 +L(Exit16): + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + SAVE_RESULT (15) +# ifdef USE_AS_STRNCPY + sub $16, %ebx + lea 16(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + +# ifdef USE_AS_STRNCPY + + CFI_PUSH(%esi) + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %esi, %ecx + add %esi, %edx + + POP (%esi) + + test %al, %al + jz L(ExitHighCase2) + + cmp $8, %ebx + ja L(CopyFrom1To16BytesLess8) + + test $0x01, %al + jnz L(Exit1) + cmp $1, %ebx + je L(Exit1) + test $0x02, %al + jnz L(Exit2) + cmp $2, %ebx + je L(Exit2) + test $0x04, %al + jnz L(Exit3) + cmp $3, %ebx + je L(Exit3) + test $0x08, %al + jnz L(Exit4) + cmp $4, %ebx + je L(Exit4) + test $0x10, %al + jnz L(Exit5) + cmp $5, %ebx + je L(Exit5) + test $0x20, %al + jnz L(Exit6) + cmp $6, %ebx + je L(Exit6) + test $0x40, %al + jnz L(Exit7) + cmp $7, %ebx + je L(Exit7) + jmp L(Exit8) + + .p2align 4 +L(ExitHighCase2): + cmp $8, %ebx + jbe L(CopyFrom1To16BytesLess8Case3) + + test $0x01, %ah + jnz L(Exit9) + cmp $9, %ebx + je L(Exit9) + test $0x02, %ah + jnz L(Exit10) + cmp $10, %ebx + je L(Exit10) + test $0x04, %ah + jnz L(Exit11) + cmp $11, %ebx + je L(Exit11) + test $0x8, %ah + jnz L(Exit12) + cmp $12, %ebx + je L(Exit12) + test $0x10, %ah + jnz L(Exit13) + cmp $13, %ebx + je L(Exit13) + test $0x20, %ah + jnz L(Exit14) + cmp $14, %ebx + je L(Exit14) + test $0x40, %ah + jnz L(Exit15) + cmp $15, %ebx + je L(Exit15) + jmp L(Exit16) + + CFI_PUSH(%esi) + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %esi, %edx + add %esi, %ecx + + POP (%esi) + + cmp $8, %ebx + ja L(ExitHigh8Case3) + +L(CopyFrom1To16BytesLess8Case3): + cmp $4, %ebx + ja L(ExitHigh4Case3) + + cmp $1, %ebx + je L(Exit1) + cmp $2, %ebx + je L(Exit2) + cmp $3, %ebx + je L(Exit3) + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT (4) + RETURN1 + + .p2align 4 +L(ExitHigh4Case3): + cmp $5, %ebx + je L(Exit5) + cmp $6, %ebx + je L(Exit6) + cmp $7, %ebx + je L(Exit7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + SAVE_RESULT (8) + RETURN1 + + .p2align 4 +L(ExitHigh8Case3): + cmp $12, %ebx + ja L(ExitHigh12Case3) + + cmp $9, %ebx + je L(Exit9) + cmp $10, %ebx + je L(Exit10) + cmp $11, %ebx + je L(Exit11) + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT (12) + RETURN1 + + .p2align 4 +L(ExitHigh12Case3): + cmp $13, %ebx + je 
L(Exit13) + cmp $14, %ebx + je L(Exit14) + cmp $15, %ebx + je L(Exit15) + movlpd (%ecx), %xmm0 + movlpd 8(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) + SAVE_RESULT (16) + RETURN1 + +# endif + + .p2align 4 +L(Exit1): + movb (%ecx), %al + movb %al, (%edx) + SAVE_RESULT (0) +# ifdef USE_AS_STRNCPY + sub $1, %ebx + lea 1(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit2): + movw (%ecx), %ax + movw %ax, (%edx) + SAVE_RESULT (1) +# ifdef USE_AS_STRNCPY + sub $2, %ebx + lea 2(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) + SAVE_RESULT (2) +# ifdef USE_AS_STRNCPY + sub $3, %ebx + lea 3(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) + SAVE_RESULT (4) +# ifdef USE_AS_STRNCPY + sub $5, %ebx + lea 5(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) + SAVE_RESULT (5) +# ifdef USE_AS_STRNCPY + sub $6, %ebx + lea 6(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) + SAVE_RESULT (6) +# ifdef USE_AS_STRNCPY + sub $7, %ebx + lea 7(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit9): + movlpd (%ecx), %xmm0 + movb 8(%ecx), %al + movlpd %xmm0, (%edx) + movb %al, 8(%edx) + SAVE_RESULT (8) +# ifdef USE_AS_STRNCPY + sub $9, %ebx + lea 9(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit10): + movlpd (%ecx), %xmm0 + movw 8(%ecx), %ax + movlpd %xmm0, (%edx) + movw %ax, 8(%edx) + SAVE_RESULT (9) +# ifdef USE_AS_STRNCPY + sub $10, %ebx + lea 10(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit11): + movlpd (%ecx), %xmm0 + movl 7(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 7(%edx) + SAVE_RESULT (10) +# ifdef USE_AS_STRNCPY + sub $11, %ebx + lea 11(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit13): + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) + SAVE_RESULT (12) +# ifdef USE_AS_STRNCPY + sub $13, %ebx + lea 13(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit14): + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 6(%edx) + SAVE_RESULT (13) +# ifdef USE_AS_STRNCPY + sub $14, %ebx + lea 14(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(Exit15): + movlpd (%ecx), 
%xmm0 + movlpd 7(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 7(%edx) + SAVE_RESULT (14) +# ifdef USE_AS_STRNCPY + sub $15, %ebx + lea 15(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + +CFI_POP (%edi) + +# ifdef USE_AS_STRNCPY + .p2align 4 +L(Fill0): + RETURN + + .p2align 4 +L(Fill1): + movb %dl, (%ecx) + RETURN + + .p2align 4 +L(Fill2): + movw %dx, (%ecx) + RETURN + + .p2align 4 +L(Fill3): + movw %dx, (%ecx) + movb %dl, 2(%ecx) + RETURN + + .p2align 4 +L(Fill4): + movl %edx, (%ecx) + RETURN + + .p2align 4 +L(Fill5): + movl %edx, (%ecx) + movb %dl, 4(%ecx) + RETURN + + .p2align 4 +L(Fill6): + movl %edx, (%ecx) + movw %dx, 4(%ecx) + RETURN + + .p2align 4 +L(Fill7): + movl %edx, (%ecx) + movl %edx, 3(%ecx) + RETURN + + .p2align 4 +L(Fill8): + movlpd %xmm0, (%ecx) + RETURN + + .p2align 4 +L(Fill9): + movlpd %xmm0, (%ecx) + movb %dl, 8(%ecx) + RETURN + + .p2align 4 +L(Fill10): + movlpd %xmm0, (%ecx) + movw %dx, 8(%ecx) + RETURN + + .p2align 4 +L(Fill11): + movlpd %xmm0, (%ecx) + movl %edx, 7(%ecx) + RETURN + + .p2align 4 +L(Fill12): + movlpd %xmm0, (%ecx) + movl %edx, 8(%ecx) + RETURN + + .p2align 4 +L(Fill13): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 5(%ecx) + RETURN + + .p2align 4 +L(Fill14): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 6(%ecx) + RETURN + + .p2align 4 +L(Fill15): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 7(%ecx) + RETURN + + .p2align 4 +L(Fill16): + movlpd %xmm0, (%ecx) + movlpd %xmm0, 8(%ecx) + RETURN + + .p2align 4 +L(StrncpyFillExit1): + lea 16(%ebx), %ebx +L(FillFrom1To16Bytes): + test %ebx, %ebx + jz L(Fill0) + cmp $16, %ebx + je L(Fill16) + cmp $8, %ebx + je L(Fill8) + jg L(FillMore8) + cmp $4, %ebx + je L(Fill4) + jg L(FillMore4) + cmp $2, %ebx + jl L(Fill1) + je L(Fill2) + jg L(Fill3) +L(FillMore8): /* but less than 16 */ + cmp $12, %ebx + je L(Fill12) + jl L(FillLess12) + cmp $14, %ebx + jl L(Fill13) + je L(Fill14) + jg L(Fill15) +L(FillMore4): /* but less than 8 */ + cmp $6, %ebx + jl L(Fill5) + je L(Fill6) + jg L(Fill7) +L(FillLess12): /* but more than 8 */ + cmp $10, %ebx + jl L(Fill9) + je L(Fill10) + jmp L(Fill11) + + CFI_PUSH(%edi) + + .p2align 4 +L(StrncpyFillTailWithZero1): + POP (%edi) +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %edx, %edx + sub $16, %ebx + jbe L(StrncpyFillExit1) + + movlpd %xmm0, (%ecx) + movlpd %xmm0, 8(%ecx) + + lea 16(%ecx), %ecx + + mov %ecx, %edx + and $0xf, %edx + sub %edx, %ecx + add %edx, %ebx + xor %edx, %edx + sub $64, %ebx + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%ecx) + movdqa %xmm0, 16(%ecx) + movdqa %xmm0, 32(%ecx) + movdqa %xmm0, 48(%ecx) + lea 64(%ecx), %ecx + sub $64, %ebx + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %ebx + jl L(StrncpyFillLess32) + movdqa %xmm0, (%ecx) + movdqa %xmm0, 16(%ecx) + lea 32(%ecx), %ecx + sub $16, %ebx + jl L(StrncpyFillExit1) + movdqa %xmm0, (%ecx) + lea 16(%ecx), %ecx + jmp L(FillFrom1To16Bytes) + +L(StrncpyFillLess32): + add $16, %ebx + jl L(StrncpyFillExit1) + movdqa %xmm0, (%ecx) + lea 16(%ecx), %ecx + jmp L(FillFrom1To16Bytes) +# endif + + .p2align 4 +L(ExitTail1): + movb (%ecx), %al + movb %al, (%edx) + SAVE_RESULT_TAIL (0) +# ifdef USE_AS_STRNCPY + sub $1, %ebx + lea 1(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail2): + movw (%ecx), %ax + movw %ax, (%edx) + SAVE_RESULT_TAIL (1) +# ifdef USE_AS_STRNCPY + sub $2, %ebx + lea 2(%edx), %ecx 
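/* strncpy semantics: the sub above decremented the remaining length bound in %ebx, and lea does not touch flags, so the jnz below still tests that result; if the bound is not yet exhausted, the rest of the buffer at %ecx must be padded with zero bytes. */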
+ jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail3): + movw (%ecx), %ax + movw %ax, (%edx) + movb 2(%ecx), %al + movb %al, 2(%edx) + SAVE_RESULT_TAIL (2) +# ifdef USE_AS_STRNCPY + sub $3, %ebx + lea 3(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail4): + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT_TAIL (3) +# ifdef USE_AS_STRNCPY + sub $4, %ebx + lea 4(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail5): + movl (%ecx), %eax + movl %eax, (%edx) + movb 4(%ecx), %al + movb %al, 4(%edx) + SAVE_RESULT_TAIL (4) +# ifdef USE_AS_STRNCPY + sub $5, %ebx + lea 5(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail6): + movl (%ecx), %eax + movl %eax, (%edx) + movw 4(%ecx), %ax + movw %ax, 4(%edx) + SAVE_RESULT_TAIL (5) +# ifdef USE_AS_STRNCPY + sub $6, %ebx + lea 6(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail7): + movl (%ecx), %eax + movl %eax, (%edx) + movl 3(%ecx), %eax + movl %eax, 3(%edx) + SAVE_RESULT_TAIL (6) +# ifdef USE_AS_STRNCPY + sub $7, %ebx + lea 7(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + SAVE_RESULT_TAIL (7) +# ifdef USE_AS_STRNCPY + sub $8, %ebx + lea 8(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# endif + RETURN + + .p2align 4 +L(ExitTail9): + movlpd (%ecx), %xmm0 + movb 8(%ecx), %al + movlpd %xmm0, (%edx) + movb %al, 8(%edx) + SAVE_RESULT_TAIL (8) +# ifdef USE_AS_STRNCPY + sub $9, %ebx + lea 9(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail10): + movlpd (%ecx), %xmm0 + movw 8(%ecx), %ax + movlpd %xmm0, (%edx) + movw %ax, 8(%edx) + SAVE_RESULT_TAIL (9) +# ifdef USE_AS_STRNCPY + sub $10, %ebx + lea 10(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail11): + movlpd (%ecx), %xmm0 + movl 7(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 7(%edx) + SAVE_RESULT_TAIL (10) +# ifdef USE_AS_STRNCPY + sub $11, %ebx + lea 11(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail12): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT_TAIL (11) +# ifdef USE_AS_STRNCPY + sub $12, %ebx + lea 12(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail13): + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) + SAVE_RESULT_TAIL (12) +# ifdef USE_AS_STRNCPY + sub $13, %ebx + lea 13(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail14): + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 
6(%edx) + SAVE_RESULT_TAIL (13) +# ifdef USE_AS_STRNCPY + sub $14, %ebx + lea 14(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN + + .p2align 4 +L(ExitTail15): + movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 7(%edx) + SAVE_RESULT_TAIL (14) +# ifdef USE_AS_STRNCPY + sub $15, %ebx + lea 15(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# endif + RETURN + + .p2align 4 +L(ExitTail16): + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + SAVE_RESULT_TAIL (15) +# ifdef USE_AS_STRNCPY + sub $16, %ebx + lea 16(%edx), %ecx + jnz L(StrncpyFillTailWithZero) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN +# endif + +# ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + CFI_PUSH (%esi) + CFI_PUSH (%edi) +# endif + .p2align 4 +L(StrncpyLeaveCase2OrCase3): + test %eax, %eax + jnz L(Aligned64LeaveCase2) + +L(Aligned64LeaveCase3): + add $48, %ebx + jle L(CopyFrom1To16BytesCase3) + movaps %xmm4, -64(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm5, -48(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase3) + movaps %xmm6, -32(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + jmp L(CopyFrom1To16BytesCase3) + +L(Aligned64LeaveCase2): + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %eax + add $48, %ebx + jle L(CopyFrom1To16BytesCase2OrCase3) + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm4, -64(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm5, -48(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm6, -32(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx + jmp L(CopyFrom1To16BytesCase2) + +/*--------------------------------------------------*/ + .p2align 4 +L(StrncpyExit1Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 7(%edx) + mov $15, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit2Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 6(%edx) + mov $14, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit3Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) + mov $13, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit4Case2OrCase3): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) + mov $12, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit5Case2OrCase3): + movlpd (%ecx), %xmm0 + movl 7(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 7(%edx) + mov $11, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit6Case2OrCase3): + movlpd (%ecx), %xmm0 + movl 6(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 6(%edx) + mov $10, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit7Case2OrCase3): + 
movlpd (%ecx), %xmm0 + movl 5(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 5(%edx) + mov $9, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit8Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + mov $8, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit9Case2OrCase3): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + mov $7, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit10Case2OrCase3): + movlpd -1(%ecx), %xmm0 + movlpd %xmm0, -1(%edx) + mov $6, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit11Case2OrCase3): + movlpd -2(%ecx), %xmm0 + movlpd %xmm0, -2(%edx) + mov $5, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit12Case2OrCase3): + movl (%ecx), %esi + movl %esi, (%edx) + mov $4, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit13Case2OrCase3): + movl -1(%ecx), %esi + movl %esi, -1(%edx) + mov $3, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit14Case2OrCase3): + movl -2(%ecx), %esi + movl %esi, -2(%edx) + mov $2, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + + .p2align 4 +L(StrncpyExit15Case2OrCase3): + movl -3(%ecx), %esi + movl %esi, -3(%edx) + mov $1, %esi + test %eax, %eax + jnz L(CopyFrom1To16BytesCase2) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave1): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit1) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 31(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit1) + palignr $1, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit1) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit1) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit1): + lea 15(%edx, %esi), %edx + lea 15(%ecx, %esi), %ecx + movdqu -16(%ecx), %xmm0 + xor %esi, %esi + movdqu %xmm0, -16(%edx) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave2): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit2) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 30(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit2) + palignr $2, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit2) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit2) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit2): + lea 14(%edx, %esi), %edx + lea 14(%ecx, %esi), %ecx + movdqu -16(%ecx), %xmm0 + xor %esi, %esi + movdqu %xmm0, -16(%edx) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave3): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit3) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 29(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit3) + palignr $3, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit3) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit3) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit3): + lea 13(%edx, %esi), %edx + lea 13(%ecx, 
%esi), %ecx + movdqu -16(%ecx), %xmm0 + xor %esi, %esi + movdqu %xmm0, -16(%edx) + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave4): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit4) + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit4) + palignr $4, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit4) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit4) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit4): + lea 12(%edx, %esi), %edx + lea 12(%ecx, %esi), %ecx + movlpd -12(%ecx), %xmm0 + movl -4(%ecx), %eax + movlpd %xmm0, -12(%edx) + movl %eax, -4(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave5): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit5) + palignr $5, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 27(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit5) + palignr $5, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit5) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit5) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit5): + lea 11(%edx, %esi), %edx + lea 11(%ecx, %esi), %ecx + movlpd -11(%ecx), %xmm0 + movl -4(%ecx), %eax + movlpd %xmm0, -11(%edx) + movl %eax, -4(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave6): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit6) + palignr $6, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 26(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit6) + palignr $6, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit6) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit6) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit6): + lea 10(%edx, %esi), %edx + lea 10(%ecx, %esi), %ecx + + movlpd -10(%ecx), %xmm0 + movw -2(%ecx), %ax + movlpd %xmm0, -10(%edx) + movw %ax, -2(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave7): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit7) + palignr $7, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 25(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit7) + palignr $7, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit7) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit7) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit7): + lea 9(%edx, %esi), %edx + lea 9(%ecx, %esi), %ecx + + movlpd -9(%ecx), %xmm0 + movb -1(%ecx), %ah + movlpd %xmm0, -9(%edx) + movb %ah, -1(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave8): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit8) + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit8) + palignr $8, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit8) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit8) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit8): + lea 8(%edx, %esi), %edx + lea 8(%ecx, %esi), %ecx + movlpd -8(%ecx), %xmm0 + movlpd %xmm0, -8(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + 
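/* Editorial note: the same three SSE idioms recur through every Shl1..Shl15 and StrncpyLeave1..15 path above, and are easier to see in C intrinsics. A minimal sketch follows (function names are illustrative, not part of this file); palignr takes a compile-time immediate shift, which is exactly why the assembly hand-instantiates one variant per source offset 1..15.

   #include <emmintrin.h>   // SSE2: pcmpeqb, pmovmskb, pminub
   #include <tmmintrin.h>   // SSSE3: palignr

   // pcmpeqb + pmovmskb: bit i of the result is set iff byte i of v
   // is zero.
   static inline int
   null_mask16 (__m128i v)
   {
     return _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, _mm_setzero_si128 ()));
   }

   // pminub folding: the unsigned byte-minimum of four blocks has a
   // zero lane iff any of the 64 input bytes is zero, so a single
   // compare covers a whole cache line, as in the Shl*LoopStart loops.
   static inline int
   null_in_64 (__m128i a, __m128i b, __m128i c, __m128i d)
   {
     return null_mask16 (_mm_min_epu8 (_mm_min_epu8 (a, b),
                                       _mm_min_epu8 (c, d)));
   }

   // palignr: take the 16 consecutive bytes that start SHIFT bytes
   // into prev and continue into cur, turning two aligned loads into
   // one realigned block.
   enum { SHIFT = 9 };   // illustrative; the asm covers 1..15
   static inline __m128i
   realign16 (__m128i prev, __m128i cur)
   {
     return _mm_alignr_epi8 (cur, prev, SHIFT);
   }

   Each 64-byte loop iteration above is then: four aligned loads, one
   null_in_64 test to decide whether to leave the loop, four realign16
   shuffles, four aligned stores. */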
+L(StrncpyLeave9): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit9) + palignr $9, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 23(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit9) + palignr $9, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit9) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit9) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit9): + lea 7(%edx, %esi), %edx + lea 7(%ecx, %esi), %ecx + + movlpd -8(%ecx), %xmm0 + movlpd %xmm0, -8(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave10): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit10) + palignr $10, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 22(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit10) + palignr $10, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit10) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit10) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit10): + lea 6(%edx, %esi), %edx + lea 6(%ecx, %esi), %ecx + + movlpd -8(%ecx), %xmm0 + movlpd %xmm0, -8(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave11): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit11) + palignr $11, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 21(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit11) + palignr $11, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit11) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit11) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit11): + lea 5(%edx, %esi), %edx + lea 5(%ecx, %esi), %ecx + movl -5(%ecx), %esi + movb -1(%ecx), %ah + movl %esi, -5(%edx) + movb %ah, -1(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave12): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit12) + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit12) + palignr $12, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit12) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit12) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit12): + lea 4(%edx, %esi), %edx + lea 4(%ecx, %esi), %ecx + movl -4(%ecx), %eax + movl %eax, -4(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave13): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit13) + palignr $13, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 19(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit13) + palignr $13, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit13) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit13) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit13): + lea 3(%edx, %esi), %edx + lea 3(%ecx, %esi), %ecx + + movl -4(%ecx), %eax + movl %eax, -4(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave14): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit14) + palignr $14, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 18(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit14) + palignr $14, %xmm3, %xmm2 + 
movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit14) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit14) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit14): + lea 2(%edx, %esi), %edx + lea 2(%ecx, %esi), %ecx + movw -2(%ecx), %ax + movw %ax, -2(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) + +L(StrncpyLeave15): + movaps %xmm2, %xmm3 + add $48, %ebx + jle L(StrncpyExit15) + palignr $15, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 17(%ecx), %xmm2 + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit15) + palignr $15, %xmm3, %xmm2 + movaps %xmm2, 16(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit15) + movaps %xmm4, 32(%edx) + lea 16(%esi), %esi + sub $16, %ebx + jbe L(StrncpyExit15) + movaps %xmm5, 48(%edx) + lea 16(%esi), %esi + lea -16(%ebx), %ebx +L(StrncpyExit15): + lea 1(%edx, %esi), %edx + lea 1(%ecx, %esi), %ecx + movb -1(%ecx), %ah + movb %ah, -1(%edx) + xor %esi, %esi + jmp L(CopyFrom1To16BytesCase3) +# endif + +# ifndef USE_AS_STRCAT +# ifdef USE_AS_STRNCPY + CFI_POP (%esi) + CFI_POP (%edi) + + .p2align 4 +L(ExitTail0): + movl %edx, %eax + RETURN + + .p2align 4 +L(StrncpyExit15Bytes): + cmp $12, %ebx + jbe L(StrncpyExit12Bytes) + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + cmpb $0, 11(%ecx) + jz L(ExitTail12) + cmp $13, %ebx + je L(ExitTail13) + cmpb $0, 12(%ecx) + jz L(ExitTail13) + cmp $14, %ebx + je L(ExitTail14) + cmpb $0, 13(%ecx) + jz L(ExitTail14) + movlpd (%ecx), %xmm0 + movlpd 7(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 7(%edx) +# ifdef USE_AS_STPCPY + lea 14(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax +# else + movl %edx, %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit12Bytes): + cmp $9, %ebx + je L(ExitTail9) + cmpb $0, 8(%ecx) + jz L(ExitTail9) + cmp $10, %ebx + je L(ExitTail10) + cmpb $0, 9(%ecx) + jz L(ExitTail10) + cmp $11, %ebx + je L(ExitTail11) + cmpb $0, 10(%ecx) + jz L(ExitTail11) + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT_TAIL (11) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit8Bytes): + cmp $4, %ebx + jbe L(StrncpyExit4Bytes) + cmpb $0, (%ecx) + jz L(ExitTail1) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + cmpb $0, 3(%ecx) + jz L(ExitTail4) + + cmp $5, %ebx + je L(ExitTail5) + cmpb $0, 4(%ecx) + jz L(ExitTail5) + cmp $6, %ebx + je L(ExitTail6) + cmpb $0, 5(%ecx) + jz L(ExitTail6) + cmp $7, %ebx + je L(ExitTail7) + cmpb $0, 6(%ecx) + jz L(ExitTail7) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) +# ifdef USE_AS_STPCPY + lea 7(%edx), %eax + cmpb $1, (%eax) + sbb $-1, %eax +# else + movl %edx, %eax +# endif + RETURN + + .p2align 4 +L(StrncpyExit4Bytes): + test %ebx, %ebx + jz L(ExitTail0) + cmp $1, %ebx + je L(ExitTail1) + cmpb $0, (%ecx) + jz L(ExitTail1) + cmp $2, %ebx + je L(ExitTail2) + cmpb $0, 1(%ecx) + jz L(ExitTail2) + cmp $3, %ebx + je L(ExitTail3) + cmpb $0, 2(%ecx) + jz L(ExitTail3) + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT_TAIL (3) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif + RETURN +# endif + +END (STRCPY) +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S new file mode 100644 index 0000000000..ffbc03c6d5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcpy.S @@ -0,0 +1,116 
@@ +/* Multiple versions of strcpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY) +# ifndef STRCPY +# define STRCPY strcpy +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __stpncpy_ssse3 +# define STRCPY_SSE2 __stpncpy_sse2 +# define STRCPY_IA32 __stpncpy_ia32 +# define __GI_STRCPY __GI_stpncpy +# define __GI___STRCPY __GI___stpncpy +# else +# define STRCPY_SSSE3 __stpcpy_ssse3 +# define STRCPY_SSE2 __stpcpy_sse2 +# define STRCPY_IA32 __stpcpy_ia32 +# define __GI_STRCPY __GI_stpcpy +# define __GI___STRCPY __GI___stpcpy +# endif +#else +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __strncpy_ssse3 +# define STRCPY_SSE2 __strncpy_sse2 +# define STRCPY_IA32 __strncpy_ia32 +# define __GI_STRCPY __GI_strncpy +# else +# define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_SSE2 __strcpy_sse2 +# define STRCPY_IA32 __strcpy_ia32 +# define __GI_STRCPY __GI_strcpy +# endif +#endif + + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strncpy in static library since we + need strncpy before the initialization happened. */ +#if IS_IN (libc) + + .text +ENTRY(STRCPY) + .type STRCPY, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (STRCPY_IA32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (STRCPY_SSE2) + HAS_ARCH_FEATURE (Fast_Unaligned_Load) + jnz 2f + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (STRCPY_SSSE3) +2: ret +END(STRCPY) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCPY_IA32, @function; \ + .align 16; \ + .globl STRCPY_IA32; \ + .hidden STRCPY_IA32; \ + STRCPY_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCPY_IA32, .-STRCPY_IA32 + +# ifdef SHARED +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcpy calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. 
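For comparison, the hand-written ENTRY(STRCPY) dispatch above is what GCC's ifunc attribute would generate from a resolver. A minimal sketch, assuming GCC builtins; cpu_has_fast_unaligned_load is a hypothetical stand-in for the Fast_Unaligned_Load arch bit, which has no __builtin_cpu_supports name:

     extern char *__strcpy_ia32 (char *, const char *);
     extern char *__strcpy_sse2 (char *, const char *);
     extern char *__strcpy_ssse3 (char *, const char *);
     extern int cpu_has_fast_unaligned_load (void);  // hypothetical

     static void *
     strcpy_resolver (void)
     {
       __builtin_cpu_init ();
       if (!__builtin_cpu_supports ("sse2"))
         return (void *) __strcpy_ia32;   // baseline fallback
       if (cpu_has_fast_unaligned_load ())
         return (void *) __strcpy_sse2;   // unaligned loads are cheap
       if (__builtin_cpu_supports ("ssse3"))
         return (void *) __strcpy_ssse3;  // palignr-based copy
       return (void *) __strcpy_sse2;
     }

     char *strcpy (char *, const char *)
       __attribute__ ((ifunc ("strcpy_resolver")));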
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCPY; __GI_STRCPY = STRCPY_IA32 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCPY; __GI___STRCPY = STRCPY_IA32 + +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# include "../../stpncpy.S" +# else +# include "../../i586/stpcpy.S" +# endif +#else +# ifndef USE_AS_STRNCPY +# include "../../i586/strcpy.S" +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c new file mode 100644 index 0000000000..6d61e190a8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn-c.c @@ -0,0 +1,2 @@ +#define __strcspn_sse2 __strcspn_ia32 +#include <sysdeps/x86_64/multiarch/strcspn-c.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S new file mode 100644 index 0000000000..21e5093924 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strcspn.S @@ -0,0 +1,75 @@ +/* Multiple versions of strcspn + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <init-arch.h> + +#ifdef USE_AS_STRPBRK +#define STRCSPN_SSE42 __strpbrk_sse42 +#define STRCSPN_IA32 __strpbrk_ia32 +#define __GI_STRCSPN __GI_strpbrk +#else +#ifndef STRCSPN +#define STRCSPN strcspn +#define STRCSPN_SSE42 __strcspn_sse42 +#define STRCSPN_IA32 __strcspn_ia32 +#define __GI_STRCSPN __GI_strcspn +#endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strpbrk in static library since we + need strpbrk before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc) + .text +ENTRY(STRCSPN) + .type STRCSPN, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (STRCSPN_IA32) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + LOAD_FUNC_GOT_EAX (STRCSPN_SSE42) +2: ret +END(STRCSPN) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCSPN_IA32, @function; \ + .globl STRCSPN_IA32; \ + .p2align 4; \ + STRCSPN_IA32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCSPN_IA32, .-STRCSPN_IA32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. 
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_IA32 +#endif + +#ifdef USE_AS_STRPBRK +#include "../../strpbrk.S" +#else +#include "../../strcspn.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S new file mode 100644 index 0000000000..d3ea864bab --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S @@ -0,0 +1,125 @@ +/* strlen with SSE2 and BSF + Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if defined SHARED && IS_IN (libc) + +#include <sysdep.h> + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) +#define PARMS 4 + 8 /* Preserve ESI and EDI. */ +#define STR PARMS +#define ENTRANCE PUSH (%esi); PUSH (%edi); cfi_remember_state +#define RETURN POP (%edi); POP (%esi); ret; \ + cfi_restore_state; cfi_remember_state + + .text +ENTRY ( __strlen_sse2_bsf) + ENTRANCE + mov STR(%esp), %edi + xor %eax, %eax + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%edi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit_less16) + mov %edi, %eax + and $-16, %eax + jmp L(align16_start) +L(next): + + mov %edi, %eax + and $-16, %eax + pcmpeqb (%eax), %xmm0 + mov $-1, %esi + sub %eax, %ecx + shl %cl, %esi + pmovmskb %xmm0, %edx + and %esi, %edx + jnz L(exit) +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + .p2align 4 +L(align16_loop): + pcmpeqb 16(%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%eax), %xmm3 + pmovmskb %xmm3, %edx + lea 64(%eax), %eax + test %edx, %edx + jz L(align16_loop) +L(exit): + sub %edi, %eax +L(exit_less16): + bsf %edx, %edx + add %edx, %eax + RETURN +L(exit16): + sub %edi, %eax + bsf %edx, %edx + add %edx, %eax + add $16, %eax + RETURN +L(exit32): + sub %edi, %eax + bsf %edx, %edx + add %edx, %eax + add $32, %eax + RETURN +L(exit48): + sub %edi, %eax + bsf %edx, %edx + add %edx, %eax + add $48, %eax + POP (%edi) + POP (%esi) + ret + +END ( __strlen_sse2_bsf) + +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S new file mode 100644 index 0000000000..36fc1469d0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen-sse2.S @@ -0,0 +1,695 @@ +/* strlen with SSE2 + 
Copyright (C) 2010-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* for strlen only SHARED version is optimized, for strcat, strncat, strnlen both STATIC and SHARED are optimized */ + +#if (defined USE_AS_STRNLEN || defined USE_AS_STRCAT || defined SHARED) && IS_IN (libc) + +# ifndef USE_AS_STRCAT + +# include <sysdep.h> +# define PARMS 4 +# define STR PARMS +# define RETURN ret + +# ifdef USE_AS_STRNLEN +# define LEN PARMS + 8 +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) +# undef RETURN +# define RETURN POP (%edi); CFI_PUSH(%edi); ret +# endif + +# ifndef STRLEN +# define STRLEN __strlen_sse2 +# endif + + atom_text_section +ENTRY (STRLEN) + mov STR(%esp), %edx +# ifdef USE_AS_STRNLEN + PUSH (%edi) + movl LEN(%esp), %edi + sub $4, %edi + jbe L(len_less4_prolog) +# endif +# endif + xor %eax, %eax + cmpb $0, (%edx) + jz L(exit_tail0) + cmpb $0, 1(%edx) + jz L(exit_tail1) + cmpb $0, 2(%edx) + jz L(exit_tail2) + cmpb $0, 3(%edx) + jz L(exit_tail3) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less8_prolog) +# endif + + cmpb $0, 4(%edx) + jz L(exit_tail4) + cmpb $0, 5(%edx) + jz L(exit_tail5) + cmpb $0, 6(%edx) + jz L(exit_tail6) + cmpb $0, 7(%edx) + jz L(exit_tail7) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less12_prolog) +# endif + + cmpb $0, 8(%edx) + jz L(exit_tail8) + cmpb $0, 9(%edx) + jz L(exit_tail9) + cmpb $0, 10(%edx) + jz L(exit_tail10) + cmpb $0, 11(%edx) + jz L(exit_tail11) + +# ifdef USE_AS_STRNLEN + sub $4, %edi + jbe L(len_less16_prolog) +# endif + + cmpb $0, 12(%edx) + jz L(exit_tail12) + cmpb $0, 13(%edx) + jz L(exit_tail13) + cmpb $0, 14(%edx) + jz L(exit_tail14) + cmpb $0, 15(%edx) + jz L(exit_tail15) + + pxor %xmm0, %xmm0 + lea 16(%edx), %eax + mov %eax, %ecx + and $-16, %eax + +# ifdef USE_AS_STRNLEN + and $15, %edx + add %edx, %edi + sub $64, %edi + jbe L(len_less64) +# endif + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb 
(%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqb (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + mov %eax, %edx + and $63, %edx + add %edx, %edi +# endif + + and $-0x40, %eax + + .p2align 4 +L(aligned_64_loop): +# ifdef USE_AS_STRNLEN + sub $64, %edi + jbe L(len_less64) +# endif + movaps (%eax), %xmm0 + movaps 16(%eax), %xmm1 + movaps 32(%eax), %xmm2 + movaps 48(%eax), %xmm6 + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqb %xmm3, %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 64(%eax), %eax + jz L(aligned_64_loop) + + pcmpeqb -64(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 48(%ecx), %ecx + jnz L(exit) + + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqb -32(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqb %xmm6, %xmm3 + pmovmskb %xmm3, %edx + lea -16(%ecx), %ecx +L(exit): + sub %ecx, %eax + test %dl, %dl + jz L(exit_high) + + mov %dl, %cl + and $15, %cl + jz L(exit_8) + test $0x01, %dl + jnz L(exit_tail0) + test $0x02, %dl + jnz L(exit_tail1) + test $0x04, %dl + jnz L(exit_tail2) + add $3, %eax + RETURN + + .p2align 4 +L(exit_8): + test $0x10, %dl + jnz L(exit_tail4) + test $0x20, %dl + jnz L(exit_tail5) + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax + RETURN + + .p2align 4 +L(exit_high): + mov %dh, %ch + and $15, %ch + jz L(exit_high_8) + test $0x01, %dh + jnz L(exit_tail8) + test $0x02, %dh + jnz L(exit_tail9) + test $0x04, %dh + jnz L(exit_tail10) + add $11, %eax + RETURN + + .p2align 4 +L(exit_high_8): + test $0x10, %dh + jnz L(exit_tail12) + test $0x20, %dh + jnz L(exit_tail13) + test $0x40, %dh + jnz L(exit_tail14) + add $15, %eax +L(exit_tail0): + RETURN + +# ifdef USE_AS_STRNLEN + + .p2align 4 +L(len_less64): + pxor %xmm0, %xmm0 + add $64, %edi + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm0 + pmovmskb %xmm0, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %edi + jbe L(return_start_len) + + pcmpeqb (%eax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%eax), %eax + test %edx, %edx + jnz L(strnlen_exit) + + movl LEN(%esp), %eax + 
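/* The count was exhausted before a NUL byte was found: strnlen + must return the untruncated maxlen in this case, so reload it. */ +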
RETURN + + .p2align 4 +L(strnlen_exit): + sub %ecx, %eax + + test %dl, %dl + jz L(strnlen_exit_high) + mov %dl, %cl + and $15, %cl + jz L(strnlen_exit_8) + test $0x01, %dl + jnz L(exit_tail0) + test $0x02, %dl + jnz L(strnlen_exit_tail1) + test $0x04, %dl + jnz L(strnlen_exit_tail2) + sub $4, %edi + jb L(return_start_len) + lea 3(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_8): + test $0x10, %dl + jnz L(strnlen_exit_tail4) + test $0x20, %dl + jnz L(strnlen_exit_tail5) + test $0x40, %dl + jnz L(strnlen_exit_tail6) + sub $8, %edi + jb L(return_start_len) + lea 7(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_high): + mov %dh, %ch + and $15, %ch + jz L(strnlen_exit_high_8) + test $0x01, %dh + jnz L(strnlen_exit_tail8) + test $0x02, %dh + jnz L(strnlen_exit_tail9) + test $0x04, %dh + jnz L(strnlen_exit_tail10) + sub $12, %edi + jb L(return_start_len) + lea 11(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_high_8): + test $0x10, %dh + jnz L(strnlen_exit_tail12) + test $0x20, %dh + jnz L(strnlen_exit_tail13) + test $0x40, %dh + jnz L(strnlen_exit_tail14) + sub $16, %edi + jb L(return_start_len) + lea 15(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail1): + sub $2, %edi + jb L(return_start_len) + lea 1(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail2): + sub $3, %edi + jb L(return_start_len) + lea 2(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail4): + sub $5, %edi + jb L(return_start_len) + lea 4(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail5): + sub $6, %edi + jb L(return_start_len) + lea 5(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail6): + sub $7, %edi + jb L(return_start_len) + lea 6(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail8): + sub $9, %edi + jb L(return_start_len) + lea 8(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail9): + sub $10, %edi + jb L(return_start_len) + lea 9(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail10): + sub $11, %edi + jb L(return_start_len) + lea 10(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail12): + sub $13, %edi + jb L(return_start_len) + lea 12(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail13): + sub $14, %edi + jb L(return_start_len) + lea 13(%eax), %eax + RETURN + + .p2align 4 +L(strnlen_exit_tail14): + sub $15, %edi + jb L(return_start_len) + lea 14(%eax), %eax + RETURN + + .p2align 4 +L(return_start_len): + movl LEN(%esp), %eax + RETURN + +/* for prolog only */ + + .p2align 4 +L(len_less4_prolog): + xor %eax, %eax + + add $4, %edi + jz L(exit_tail0) + + cmpb $0, (%edx) + jz L(exit_tail0) + cmp $1, %edi + je L(exit_tail1) + + cmpb $0, 1(%edx) + jz L(exit_tail1) + cmp $2, %edi + je L(exit_tail2) + + cmpb $0, 2(%edx) + jz L(exit_tail2) + cmp $3, %edi + je L(exit_tail3) + + cmpb $0, 3(%edx) + jz L(exit_tail3) + mov $4, %eax + RETURN + + .p2align 4 +L(len_less8_prolog): + add $4, %edi + + cmpb $0, 4(%edx) + jz L(exit_tail4) + cmp $1, %edi + je L(exit_tail5) + + cmpb $0, 5(%edx) + jz L(exit_tail5) + cmp $2, %edi + je L(exit_tail6) + + cmpb $0, 6(%edx) + jz L(exit_tail6) + cmp $3, %edi + je L(exit_tail7) + + cmpb $0, 7(%edx) + jz L(exit_tail7) + mov $8, %eax + RETURN + + + .p2align 4 +L(len_less12_prolog): + add $4, %edi + + cmpb $0, 8(%edx) + jz L(exit_tail8) + cmp $1, %edi + je L(exit_tail9) + + cmpb $0, 9(%edx) + jz L(exit_tail9) + cmp $2, %edi + je L(exit_tail10) + + cmpb $0, 10(%edx) + jz L(exit_tail10) + cmp $3, %edi + je L(exit_tail11) + + cmpb $0, 11(%edx) + jz L(exit_tail11) + mov $12, %eax + RETURN + + .p2align 4 +L(len_less16_prolog): + add 
$4, %edi + + cmpb $0, 12(%edx) + jz L(exit_tail12) + cmp $1, %edi + je L(exit_tail13) + + cmpb $0, 13(%edx) + jz L(exit_tail13) + cmp $2, %edi + je L(exit_tail14) + + cmpb $0, 14(%edx) + jz L(exit_tail14) + cmp $3, %edi + je L(exit_tail15) + + cmpb $0, 15(%edx) + jz L(exit_tail15) + mov $16, %eax + RETURN +# endif + + .p2align 4 +L(exit_tail1): + add $1, %eax + RETURN + +L(exit_tail2): + add $2, %eax + RETURN + +L(exit_tail3): + add $3, %eax + RETURN + +L(exit_tail4): + add $4, %eax + RETURN + +L(exit_tail5): + add $5, %eax + RETURN + +L(exit_tail6): + add $6, %eax + RETURN + +L(exit_tail7): + add $7, %eax + RETURN + +L(exit_tail8): + add $8, %eax + RETURN + +L(exit_tail9): + add $9, %eax + RETURN + +L(exit_tail10): + add $10, %eax + RETURN + +L(exit_tail11): + add $11, %eax + RETURN + +L(exit_tail12): + add $12, %eax + RETURN + +L(exit_tail13): + add $13, %eax + RETURN + +L(exit_tail14): + add $14, %eax + RETURN + +L(exit_tail15): + add $15, %eax +# ifndef USE_AS_STRCAT + RETURN +END (STRLEN) +# endif +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S new file mode 100644 index 0000000000..77cf6bcdb0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strlen.S @@ -0,0 +1,60 @@ +/* Multiple versions of strlen + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc and for the + DSO. In static binaries, we need strlen before the initialization + happened. */ +#if defined SHARED && IS_IN (libc) + .text +ENTRY(strlen) + .type strlen, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strlen_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__strlen_sse2_bsf) + HAS_ARCH_FEATURE (Slow_BSF) + jz 2f + LOAD_FUNC_GOT_EAX (__strlen_sse2) +2: ret +END(strlen) + +# undef ENTRY +# define ENTRY(name) \ + .type __strlen_ia32, @function; \ + .globl __strlen_ia32; \ + .p2align 4; \ + __strlen_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strlen_ia32, .-__strlen_ia32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. 
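   Internal libc callers are therefore bound straight to the ia32 + version: the macro below aliases __GI_strlen to __strlen_ia32, + bypassing the IFUNC resolver entirely.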
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strlen; __GI_strlen = __strlen_ia32 +#endif + +#include "../../i586/strlen.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c new file mode 100644 index 0000000000..76581eb62b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase-c.c @@ -0,0 +1,8 @@ +#include <string.h> + +extern __typeof (strncasecmp) __strncasecmp_nonascii; + +#define __strncasecmp __strncasecmp_nonascii +#include <string/strncase.c> + +strong_alias (__strncasecmp_nonascii, __strncasecmp_ia32) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S new file mode 100644 index 0000000000..a56e63a566 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase.S @@ -0,0 +1,39 @@ +/* Entry point for multi-version x86 strncasecmp. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY(__strncasecmp) + .type __strncasecmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strncasecmp_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__strncasecmp_ssse3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + HAS_ARCH_FEATURE (Slow_SSE4_2) + jnz 2f + LOAD_FUNC_GOT_EAX (__strncasecmp_sse4_2) +2: ret +END(__strncasecmp) + +weak_alias (__strncasecmp, strncasecmp) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c new file mode 100644 index 0000000000..7e601af271 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-c.c @@ -0,0 +1,13 @@ +#include <string.h> + +extern __typeof (strncasecmp_l) __strncasecmp_l_nonascii; + +#define __strncasecmp_l __strncasecmp_l_nonascii +#define USE_IN_EXTENDED_LOCALE_MODEL 1 +#include <string/strncase.c> + +strong_alias (__strncasecmp_l_nonascii, __strncasecmp_l_ia32) + +/* The needs of strcasecmp in libc are minimal, no need to go through + the IFUNC. 
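   (Despite the strcasecmp wording, this applies to strncasecmp_l + here: the __GI___strncasecmp_l alias below binds libc-internal + calls directly to the C implementation.)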
*/ +strong_alias (__strncasecmp_l_nonascii, __GI___strncasecmp_l) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S new file mode 100644 index 0000000000..557210832e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-sse4.S @@ -0,0 +1,2 @@ +#define USE_AS_STRNCASECMP_L 1 +#include "strcmp-sse4.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S new file mode 100644 index 0000000000..d438a1ae35 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l-ssse3.S @@ -0,0 +1,2 @@ +#define USE_AS_STRNCASECMP_L 1 +#include "strcmp-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S new file mode 100644 index 0000000000..8a74ee8574 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncase_l.S @@ -0,0 +1,7 @@ +/* Multiple versions of strncasecmp_l + All versions must be listed in ifunc-impl-list.c. */ +#define STRCMP __strncasecmp_l +#define USE_AS_STRNCASECMP_L +#include "strcmp.S" + +weak_alias (__strncasecmp_l, strncasecmp_l) diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c new file mode 100644 index 0000000000..132a000545 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-c.c @@ -0,0 +1,8 @@ +#define STRNCAT __strncat_ia32 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__strncat_ia32, __GI___strncat, __strncat_ia32); +#endif + +#include "string/strncat.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S new file mode 100644 index 0000000000..f1045b72b8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-sse2.S @@ -0,0 +1,4 @@ +#define STRCAT __strncat_sse2 +#define USE_AS_STRNCAT + +#include "strcat-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S new file mode 100644 index 0000000000..625b90a978 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat-ssse3.S @@ -0,0 +1,4 @@ +#define STRCAT __strncat_ssse3 +#define USE_AS_STRNCAT + +#include "strcat-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S new file mode 100644 index 0000000000..5c1bf41453 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncat.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncat + All versions must be listed in ifunc-impl-list.c. 
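   strncat is built from the strcat template: defining USE_AS_STRNCAT + makes strcat.S honor the extra length argument, and STRCAT renames + the entry point.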
*/ +#define STRCAT strncat +#define USE_AS_STRNCAT +#include "strcat.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c new file mode 100644 index 0000000000..cc059da494 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-c.c @@ -0,0 +1,8 @@ +#ifdef SHARED +# define STRNCMP __strncmp_ia32 +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncmp_ia32, __GI_strncmp, __strncmp_ia32); +#endif + +#include "string/strncmp.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S new file mode 100644 index 0000000000..cf14dfaf6c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-sse4.S @@ -0,0 +1,5 @@ +#ifdef SHARED +# define USE_AS_STRNCMP +# define STRCMP __strncmp_sse4_2 +# include "strcmp-sse4.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S new file mode 100644 index 0000000000..536c8685f2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp-ssse3.S @@ -0,0 +1,5 @@ +#ifdef SHARED +# define USE_AS_STRNCMP +# define STRCMP __strncmp_ssse3 +# include "strcmp-ssse3.S" +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S new file mode 100644 index 0000000000..150d4786d2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncmp.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncmp + All versions must be listed in ifunc-impl-list.c. */ +#define USE_AS_STRNCMP +#define STRCMP strncmp +#include "strcmp.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c new file mode 100644 index 0000000000..201e3f98b3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-c.c @@ -0,0 +1,8 @@ +#define STRNCPY __strncpy_ia32 +#ifdef SHARED +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncpy_ia32, __GI_strncpy, __strncpy_ia32); +#endif + +#include "string/strncpy.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S new file mode 100644 index 0000000000..bdd99239a4 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_sse2 +#include "strcpy-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S new file mode 100644 index 0000000000..bf82ee447d --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S new file mode 100644 index 0000000000..9c257efc6e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strncpy.S @@ -0,0 +1,5 @@ +/* Multiple versions of strncpy + All versions must be listed in ifunc-impl-list.c. 
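   Likewise strncpy is built from the strcpy template via + USE_AS_STRNCPY.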
*/ +#define USE_AS_STRNCPY +#define STRCPY strncpy +#include "strcpy.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c new file mode 100644 index 0000000000..351e939a93 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-c.c @@ -0,0 +1,10 @@ +#define STRNLEN __strnlen_ia32 +#ifdef SHARED +# undef libc_hidden_def +# define libc_hidden_def(name) \ + __hidden_ver1 (__strnlen_ia32, __GI_strnlen, __strnlen_ia32); \ + strong_alias (__strnlen_ia32, __strnlen_ia32_1); \ + __hidden_ver1 (__strnlen_ia32_1, __GI___strnlen, __strnlen_ia32_1); +#endif + +#include "string/strnlen.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S new file mode 100644 index 0000000000..56b6ae2a5c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNLEN +#define STRLEN __strnlen_sse2 +#include "strlen-sse2.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S new file mode 100644 index 0000000000..d241522c70 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strnlen.S @@ -0,0 +1,37 @@ +/* Multiple versions of strnlen + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__strnlen) + .type __strnlen, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strnlen_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__strnlen_sse2) +2: ret +END(__strnlen) + +weak_alias(__strnlen, strnlen) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c new file mode 100644 index 0000000000..5db62053b3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk-c.c @@ -0,0 +1,2 @@ +#define __strpbrk_sse2 __strpbrk_ia32 +#include <sysdeps/x86_64/multiarch/strpbrk-c.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S new file mode 100644 index 0000000000..7201d6376f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strpbrk.S @@ -0,0 +1,5 @@ +/* Multiple versions of strpbrk + All versions must be listed in ifunc-impl-list.c. 
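   strpbrk reuses the strcspn engine: both scan for the first byte + belonging to the given set, and USE_AS_STRPBRK merely switches the + result from a length to a pointer (or NULL).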
*/ +#define STRCSPN strpbrk +#define USE_AS_STRPBRK +#include "strcspn.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S new file mode 100644 index 0000000000..39a7c8825b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S @@ -0,0 +1,282 @@ +/* strrchr with SSE2, using bsf and bsr + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + + .text +ENTRY (__strrchr_sse2_bsf) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + PUSH (%edi) + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + /* ECX has OFFSET. */ + and $63, %ecx + cmp $48, %ecx + pshufd $0, %xmm1, %xmm1 + ja L(crosscache) + +/* unaligned string. */ + movdqu (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm2, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + + test %eax, %eax + jnz L(unaligned_match1) + + test %edx, %edx + jnz L(return_null) + + and $-16, %edi + add $16, %edi + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_return_value1): + bsf %edx, %ecx + mov $2, %edx + shl %cl, %edx + sub $1, %edx + and %edx, %eax + jz L(return_null) + bsr %eax, %eax + add %edi, %eax + POP (%edi) + ret + CFI_PUSH (%edi) + + .p2align 4 +L(unaligned_match1): + test %edx, %edx + jnz L(unaligned_return_value1) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + lea 16(%edi), %esi + and $-16, %edi + add $16, %edi + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(crosscache): +/* Handle unaligned string. */ + and $15, %ecx + and $-16, %edi + pxor %xmm3, %xmm3 + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm3, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. 
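   %edi was rounded down to a 16-byte boundary above, so the low + OFFSET bits of both masks describe bytes that precede the real + start of the string; shifting right by %cl discards exactly those + bits.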
*/ + shr %cl, %edx + shr %cl, %eax + + test %eax, %eax + jnz L(unaligned_match) + + test %edx, %edx + jnz L(return_null) + + add $16, %edi + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_return_value): + add %ecx, %edi + bsf %edx, %ecx + mov $2, %edx + shl %cl, %edx + sub $1, %edx + and %edx, %eax + jz L(return_null) + bsr %eax, %eax + add %edi, %eax + POP (%edi) + ret + CFI_PUSH (%edi) + + .p2align 4 +L(unaligned_match): + test %edx, %edx + jnz L(unaligned_return_value) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + add $16, %edi + lea (%edi, %ecx), %esi + +/* Loop start on aligned string. */ + .p2align 4 +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jz L(loop) + +L(matches): + test %eax, %eax + jnz L(match) +L(return_value): + test %ebx, %ebx + jz L(return_null_1) + bsr %ebx, %eax + add %esi, %eax + + POP (%ebx) + POP (%esi) + + sub $16, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(match): + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(return_value_1) + mov %eax, %ebx + mov %edi, %esi + jmp L(loop) + + .p2align 4 +L(return_value_1): + bsf %ecx, %ecx + mov $2, %edx + shl %cl, %edx + sub $1, %edx + and %edx, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + + bsr %eax, %eax + add %edi, %eax + sub $16, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) +/* Return NULL. */ + .p2align 4 +L(return_null): + xor %eax, %eax + POP (%edi) + ret + + CFI_PUSH (%edi) + CFI_PUSH (%ebx) + CFI_PUSH (%esi) +/* Return NULL. */ + .p2align 4 +L(return_null_1): + POP (%ebx) + POP (%esi) + POP (%edi) + xor %eax, %eax + ret + +END (__strrchr_sse2_bsf) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S new file mode 100644 index 0000000000..20934288be --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr-sse2.S @@ -0,0 +1,708 @@ +/* strrchr SSE2 without bsf and bsr + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 8 +# define ENTRANCE PUSH(%edi); +# define RETURN POP(%edi); ret; CFI_PUSH(%edi); + +# define STR1 PARMS +# define STR2 STR1+4 + + atom_text_section +ENTRY (__strrchr_sse2) + + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + pxor %xmm2, %xmm2 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + /* ECX has OFFSET. */ + and $63, %ecx + cmp $48, %ecx + pshufd $0, %xmm1, %xmm1 + ja L(crosscache) + +/* unaligned string. */ + movdqu (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm2, %ecx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + add $16, %edi + + test %eax, %eax + jnz L(unaligned_match1) + + test %ecx, %ecx + jnz L(return_null) + + and $-16, %edi + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_match1): + test %ecx, %ecx + jnz L(prolog_find_zero_1) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + mov %edi, %esi + and $-16, %edi + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(crosscache): +/* Handle unaligned string. */ + and $15, %ecx + and $-16, %edi + pxor %xmm3, %xmm3 + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm3 + pcmpeqb %xmm1, %xmm0 + /* Find where NULL is. */ + pmovmskb %xmm3, %edx + /* Check if there is a match. */ + pmovmskb %xmm0, %eax + /* Remove the leading bytes. */ + shr %cl, %edx + shr %cl, %eax + add $16, %edi + + test %eax, %eax + jnz L(unaligned_match) + + test %edx, %edx + jnz L(return_null) + + PUSH (%esi) + PUSH (%ebx) + + xor %ebx, %ebx + jmp L(loop) + + CFI_POP (%esi) + CFI_POP (%ebx) + + .p2align 4 +L(unaligned_match): + test %edx, %edx + jnz L(prolog_find_zero) + + PUSH (%esi) + PUSH (%ebx) + + mov %eax, %ebx + lea (%edi, %ecx), %esi + +/* Loop start on aligned string. 
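   The loop is unrolled four times: each step scans 16 aligned bytes + and the match mask is OR-ed into the NUL mask, so a single branch + per 16-byte block suffices. %ebx/%esi remember the mask and address + of the most recent block that contained the character.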
*/ + .p2align 4 +L(loop): + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm0, %xmm2 + add $16, %edi + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jz L(loop) + +L(matches): + test %eax, %eax + jnz L(match) +L(return_value): + test %ebx, %ebx + jz L(return_null_1) + mov %ebx, %eax + mov %esi, %edi + + POP (%ebx) + POP (%esi) + + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(return_null_1): + POP (%ebx) + POP (%esi) + + xor %eax, %eax + RETURN + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(match): + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(find_zero) + mov %eax, %ebx + mov %edi, %esi + jmp L(loop) + + .p2align 4 +L(find_zero): + test %cl, %cl + jz L(find_zero_high) + mov %cl, %dl + and $15, %dl + jz L(find_zero_8) + test $0x01, %cl + jnz L(FindZeroExit1) + test $0x02, %cl + jnz L(FindZeroExit2) + test $0x04, %cl + jnz L(FindZeroExit3) + and $1 << 4 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_8): + test $0x10, %cl + jnz L(FindZeroExit5) + test $0x20, %cl + jnz L(FindZeroExit6) + test $0x40, %cl + jnz L(FindZeroExit7) + and $1 << 8 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_high): + mov %ch, %dh + and $15, %dh + jz L(find_zero_high_8) + test $0x01, %ch + jnz L(FindZeroExit9) + test $0x02, %ch + jnz L(FindZeroExit10) + test $0x04, %ch + jnz L(FindZeroExit11) + and $1 << 12 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_high_8): + test $0x10, %ch + jnz L(FindZeroExit13) + test $0x20, %ch + jnz L(FindZeroExit14) + test $0x40, %ch + jnz L(FindZeroExit15) + and $1 << 16 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit1): + and $1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit2): + and $1 << 2 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit3): + and $1 << 3 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit5): + and $1 << 5 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit6): + and $1 << 6 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit7): + and $1 << 7 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit9): + and $1 << 9 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP 
(%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit10): + and $1 << 10 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit11): + and $1 << 11 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit13): + and $1 << 13 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit14): + and $1 << 14 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + jmp L(match_exit) + + CFI_PUSH (%ebx) + CFI_PUSH (%esi) + + .p2align 4 +L(FindZeroExit15): + and $1 << 15 - 1, %eax + jz L(return_value) + + POP (%ebx) + POP (%esi) + + .p2align 4 +L(match_exit): + test %ah, %ah + jnz L(match_exit_high) + mov %al, %dl + and $15 << 4, %dl + jnz L(match_exit_8) + test $0x08, %al + jnz L(Exit4) + test $0x04, %al + jnz L(Exit3) + test $0x02, %al + jnz L(Exit2) + lea -16(%edi), %eax + RETURN + + .p2align 4 +L(match_exit_8): + test $0x80, %al + jnz L(Exit8) + test $0x40, %al + jnz L(Exit7) + test $0x20, %al + jnz L(Exit6) + lea -12(%edi), %eax + RETURN + + .p2align 4 +L(match_exit_high): + mov %ah, %dh + and $15 << 4, %dh + jnz L(match_exit_high_8) + test $0x08, %ah + jnz L(Exit12) + test $0x04, %ah + jnz L(Exit11) + test $0x02, %ah + jnz L(Exit10) + lea -8(%edi), %eax + RETURN + + .p2align 4 +L(match_exit_high_8): + test $0x80, %ah + jnz L(Exit16) + test $0x40, %ah + jnz L(Exit15) + test $0x20, %ah + jnz L(Exit14) + lea -4(%edi), %eax + RETURN + + .p2align 4 +L(Exit2): + lea -15(%edi), %eax + RETURN + + .p2align 4 +L(Exit3): + lea -14(%edi), %eax + RETURN + + .p2align 4 +L(Exit4): + lea -13(%edi), %eax + RETURN + + .p2align 4 +L(Exit6): + lea -11(%edi), %eax + RETURN + + .p2align 4 +L(Exit7): + lea -10(%edi), %eax + RETURN + + .p2align 4 +L(Exit8): + lea -9(%edi), %eax + RETURN + + .p2align 4 +L(Exit10): + lea -7(%edi), %eax + RETURN + + .p2align 4 +L(Exit11): + lea -6(%edi), %eax + RETURN + + .p2align 4 +L(Exit12): + lea -5(%edi), %eax + RETURN + + .p2align 4 +L(Exit14): + lea -3(%edi), %eax + RETURN + + .p2align 4 +L(Exit15): + lea -2(%edi), %eax + RETURN + + .p2align 4 +L(Exit16): + lea -1(%edi), %eax + RETURN + +/* Return NULL. 
*/ + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero): + add %ecx, %edi + mov %edx, %ecx +L(prolog_find_zero_1): + test %cl, %cl + jz L(prolog_find_zero_high) + mov %cl, %dl + and $15, %dl + jz L(prolog_find_zero_8) + test $0x01, %cl + jnz L(PrologFindZeroExit1) + test $0x02, %cl + jnz L(PrologFindZeroExit2) + test $0x04, %cl + jnz L(PrologFindZeroExit3) + and $1 << 4 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero_8): + test $0x10, %cl + jnz L(PrologFindZeroExit5) + test $0x20, %cl + jnz L(PrologFindZeroExit6) + test $0x40, %cl + jnz L(PrologFindZeroExit7) + and $1 << 8 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero_high): + mov %ch, %dh + and $15, %dh + jz L(prolog_find_zero_high_8) + test $0x01, %ch + jnz L(PrologFindZeroExit9) + test $0x02, %ch + jnz L(PrologFindZeroExit10) + test $0x04, %ch + jnz L(PrologFindZeroExit11) + and $1 << 12 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero_high_8): + test $0x10, %ch + jnz L(PrologFindZeroExit13) + test $0x20, %ch + jnz L(PrologFindZeroExit14) + test $0x40, %ch + jnz L(PrologFindZeroExit15) + and $1 << 16 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit1): + and $1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit2): + and $1 << 2 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit3): + and $1 << 3 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit5): + and $1 << 5 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit6): + and $1 << 6 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit7): + and $1 << 7 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit9): + and $1 << 9 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit10): + and $1 << 10 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit11): + and $1 << 11 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit13): + and $1 << 13 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit14): + and $1 << 14 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + + .p2align 4 +L(PrologFindZeroExit15): + and $1 << 15 - 1, %eax + jnz L(match_exit) + xor %eax, %eax + RETURN + +END (__strrchr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S new file mode 100644 index 0000000000..d9281eaeae --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strrchr.S @@ -0,0 +1,57 @@ +/* Multiple versions of strrchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(strrchr) + .type strrchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strrchr_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__strrchr_sse2_bsf) + HAS_ARCH_FEATURE (Slow_BSF) + jz 2f + LOAD_FUNC_GOT_EAX (__strrchr_sse2) +2: ret +END(strrchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __strrchr_ia32, @function; \ + .globl __strrchr_ia32; \ + .p2align 4; \ + __strrchr_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strrchr_ia32, .-__strrchr_ia32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strrchr; __GI_strrchr = __strrchr_ia32 +#endif + +#include "../../strrchr.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c new file mode 100644 index 0000000000..bea09dea71 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn-c.c @@ -0,0 +1,2 @@ +#define __strspn_sse2 __strspn_ia32 +#include <sysdeps/x86_64/multiarch/strspn-c.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S new file mode 100644 index 0000000000..1269062381 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/strspn.S @@ -0,0 +1,56 @@ +/* Multiple versions of strspn + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2009-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <config.h> +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. 
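   Unlike strlen earlier there is no SHARED-only guard, presumably + because strspn is not needed before IRELATIVE relocations are + processed during static startup, so static libc can use the IFUNC + as well.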
*/ +#if IS_IN (libc) + .text +ENTRY(strspn) + .type strspn, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__strspn_ia32) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + LOAD_FUNC_GOT_EAX (__strspn_sse42) +2: ret +END(strspn) + +# undef ENTRY +# define ENTRY(name) \ + .type __strspn_ia32, @function; \ + .globl __strspn_ia32; \ + .p2align 4; \ +__strspn_ia32: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __strspn_ia32, .-__strspn_ia32 +# undef libc_hidden_builtin_def +/* IFUNC doesn't work with the hidden functions in shared library since + they will be called without setting up EBX needed for PLT which is + used by IFUNC. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_strspn; __GI_strspn = __strspn_ia32 +#endif + +#include "../../strspn.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c new file mode 100644 index 0000000000..593cfec273 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/test-multiarch.c @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/test-multiarch.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c new file mode 100644 index 0000000000..7760b966e2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.c @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/varshift.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h new file mode 100644 index 0000000000..7c72c70d67 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/varshift.h @@ -0,0 +1 @@ +#include <sysdeps/x86_64/multiarch/varshift.h> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c new file mode 100644 index 0000000000..38d41d04de --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-c.c @@ -0,0 +1,22 @@ +#include <wchar.h> + +#if IS_IN (libc) +# undef libc_hidden_weak +# define libc_hidden_weak(name) + +# undef weak_alias +# define weak_alias(name,alias) + +# ifdef SHARED +# undef libc_hidden_def +# define libc_hidden_def(name) \ + __hidden_ver1 (__wcschr_ia32, __GI_wcschr, __wcschr_ia32); \ + strong_alias (__wcschr_ia32, __wcschr_ia32_1); \ + __hidden_ver1 (__wcschr_ia32_1, __GI___wcschr, __wcschr_ia32_1); +# endif +#endif + +extern __typeof (wcschr) __wcschr_ia32; + +#define WCSCHR __wcschr_ia32 +#include <wcsmbs/wcschr.c> diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S new file mode 100644 index 0000000000..9ff6c3b8d6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr-sse2.S @@ -0,0 +1,219 @@ +/* wcschr with SSE2, without using bsf instructions + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + + atom_text_section +ENTRY (__wcschr_sse2) + + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + + mov %ecx, %eax + punpckldq %xmm1, %xmm1 + pxor %xmm2, %xmm2 + punpckldq %xmm1, %xmm1 + + and $63, %eax + cmp $48, %eax + ja L(cross_cache) + + movdqu (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + and $-16, %ecx + jmp L(loop) + + .p2align 4 +L(cross_cache): + PUSH (%edi) + mov %ecx, %edi + mov %eax, %ecx + and $-16, %edi + and $15, %ecx + movdqa (%edi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + + sarl %cl, %edx + sarl %cl, %eax + test %eax, %eax + jz L(unaligned_no_match) + + add %edi, %ecx + POP (%edi) + + test %edx, %edx + jz L(match_case1) + test %al, %al + jz L(match_higth_case2) + test $15, %al + jnz L(match_case2_4) + test $15, %dl + jnz L(return_null) + lea 4(%ecx), %eax + ret + + CFI_PUSH (%edi) + + .p2align 4 +L(unaligned_no_match): + mov %edi, %ecx + POP (%edi) + + test %edx, %edx + jnz L(return_null) + + pxor %xmm2, %xmm2 + +/* Loop start on aligned string. */ + .p2align 4 +L(loop): + add $16, %ecx + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jnz L(matches) + add $16, %ecx + + movdqa (%ecx), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %edx + pmovmskb %xmm0, %eax + or %eax, %edx + jz L(loop) + + .p2align 4 +L(matches): + pmovmskb %xmm2, %edx + test %eax, %eax + jz L(return_null) + test %edx, %edx + jz L(match_case1) + + .p2align 4 +L(match_case2): + test %al, %al + jz L(match_higth_case2) + test $15, %al + jnz L(match_case2_4) + test $15, %dl + jnz L(return_null) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(match_case2_4): + mov %ecx, %eax + ret + + .p2align 4 +L(match_higth_case2): + test %dl, %dl + jnz L(return_null) + test $15, %ah + jnz L(match_case2_12) + test $15, %dh + jnz L(return_null) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(match_case2_12): + lea 8(%ecx), %eax + ret + + .p2align 4 +L(match_case1): + test %al, %al + jz L(match_higth_case1) + + test $0x01, %al + jnz L(exit0) + lea 4(%ecx), %eax + ret + + .p2align 4 +L(match_higth_case1): + test $0x01, %ah + jnz L(exit3) + lea 12(%ecx), %eax + ret + + .p2align 4 +L(exit0): + mov %ecx, %eax + ret + + .p2align 4 +L(exit3): + lea 8(%ecx), %eax + ret + + .p2align 4 +L(return_null): + xor %eax, %eax + ret + +END (__wcschr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S new file mode 100644 index 0000000000..d3c65a6436 --- /dev/null +++ 
b/REORG.TODO/sysdeps/i386/i686/multiarch/wcschr.S @@ -0,0 +1,36 @@ +/* Multiple versions of wcschr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(__wcschr) + .type __wcschr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcschr_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__wcschr_sse2) +2: ret +END(__wcschr) +weak_alias (__wcschr, wcschr) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c new file mode 100644 index 0000000000..e3337d77e2 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-c.c @@ -0,0 +1,14 @@ +#include <wchar.h> + +#define WCSCMP __wcscmp_ia32 +#ifdef SHARED +# undef libc_hidden_def +# define libc_hidden_def(name) \ + __hidden_ver1 (__wcscmp_ia32, __GI___wcscmp, __wcscmp_ia32); +#endif +#undef weak_alias +#define weak_alias(name, alias) + +extern __typeof (wcscmp) __wcscmp_ia32; + +#include "wcsmbs/wcscmp.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S new file mode 100644 index 0000000000..a464b58204 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp-sse2.S @@ -0,0 +1,1018 @@ +/* wcscmp with SSE2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define ENTRANCE PUSH(%esi); PUSH(%edi) +# define RETURN POP(%edi); POP(%esi); ret; CFI_PUSH(%esi); CFI_PUSH(%edi); +# define PARMS 4 +# define STR1 PARMS +# define STR2 STR1+4 + +/* Note: wcscmp uses signed comparison, not unsigned as in the strcmp function. 
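   As an illustrative C sketch only (on i386 wchar_t is a signed + 32-bit integer): + +     wchar_t c1 = *s1, c2 = *s2; +     if (c1 != c2) +       return c1 > c2 ? 1 : -1; + +   whereas strcmp must compare the bytes as unsigned char.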
*/ + + .text +ENTRY (__wcscmp_sse2) +/* + * This implementation uses SSE to compare up to 16 bytes at a time. +*/ + mov STR1(%esp), %edx + mov STR2(%esp), %eax + + mov (%eax), %ecx + cmp %ecx, (%edx) + jne L(neq) + test %ecx, %ecx + jz L(eq) + + mov 4(%eax), %ecx + cmp %ecx, 4(%edx) + jne L(neq) + test %ecx, %ecx + jz L(eq) + + mov 8(%eax), %ecx + cmp %ecx, 8(%edx) + jne L(neq) + test %ecx, %ecx + jz L(eq) + + mov 12(%eax), %ecx + cmp %ecx, 12(%edx) + jne L(neq) + test %ecx, %ecx + jz L(eq) + + ENTRANCE + add $16, %eax + add $16, %edx + + mov %eax, %esi + mov %edx, %edi + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + mov %al, %ch + mov %dl, %cl + and $63, %eax /* esi alignment in cache line */ + and $63, %edx /* edi alignment in cache line */ + and $15, %cl + jz L(continue_00) + cmp $16, %edx + jb L(continue_0) + cmp $32, %edx + jb L(continue_16) + cmp $48, %edx + jb L(continue_32) + +L(continue_48): + and $15, %ch + jz L(continue_48_00) + cmp $16, %eax + jb L(continue_0_48) + cmp $32, %eax + jb L(continue_16_48) + cmp $48, %eax + jb L(continue_32_48) + + .p2align 4 +L(continue_48_48): + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%edi), %xmm1 + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%edi), %xmm1 + movdqu 48(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_48_48) + +L(continue_0): + and $15, %ch + jz L(continue_0_00) + cmp $16, %eax + jb L(continue_0_0) + cmp $32, %eax + jb L(continue_0_16) + cmp $48, %eax + jb L(continue_0_32) + + .p2align 4 +L(continue_0_48): + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%edi), %xmm1 + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
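   (%xmm0 was zeroed at entry and stays zero on the fall-through + path: when no lane of the loaded data is L'\0', this pcmpeqd writes + all-zero lanes back, so the register needs no re-clearing before + the next block.)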
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + mov 48(%esi), %ecx + cmp %ecx, 48(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 52(%esi), %ecx + cmp %ecx, 52(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 56(%esi), %ecx + cmp %ecx, 56(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 60(%esi), %ecx + cmp %ecx, 60(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + add $64, %esi + add $64, %edi + jmp L(continue_0_48) + + .p2align 4 +L(continue_00): + and $15, %ch + jz L(continue_00_00) + cmp $16, %eax + jb L(continue_00_0) + cmp $32, %eax + jb L(continue_00_16) + cmp $48, %eax + jb L(continue_00_32) + + .p2align 4 +L(continue_00_48): + pcmpeqd (%edi), %xmm0 + mov (%edi), %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(less4_double_words1) + + cmp (%esi), %eax + jne L(nequal) + + mov 4(%edi), %eax + cmp 4(%esi), %eax + jne L(nequal) + + mov 8(%edi), %eax + cmp 8(%esi), %eax + jne L(nequal) + + mov 12(%edi), %eax + cmp 12(%esi), %eax + jne L(nequal) + + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? 
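+   On the block naming: the L(continue_A_B) variants specialize on the
+   16-byte phase of each string within its 64-byte cache line, as
+   classified on entry by the and $63 / cmp $16, $32, $48 ladder.
+   Each of the sixteen variants interleaves scalar 4-byte compares
+   with 16-byte vector compares in a pattern apparently chosen so that
+   the movdqu loads do not straddle a cache-line boundary.  Roughly
+   (a sketch, not original source):
+
+     unsigned a = (uintptr_t) s1 & 63;
+     unsigned b = (uintptr_t) s2 & 63;
+     dispatch to one of the 4 x 4 loops indexed by a >> 4 and b >> 4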
*/ + pcmpeqd 48(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_00_48) + + .p2align 4 +L(continue_32): + and $15, %ch + jz L(continue_32_00) + cmp $16, %eax + jb L(continue_0_32) + cmp $32, %eax + jb L(continue_16_32) + cmp $48, %eax + jb L(continue_32_32) + + .p2align 4 +L(continue_32_48): + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 16(%esi), %ecx + cmp %ecx, 16(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 20(%esi), %ecx + cmp %ecx, 20(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 24(%esi), %ecx + cmp %ecx, 24(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 28(%esi), %ecx + cmp %ecx, 28(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 32(%edi), %xmm1 + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%edi), %xmm1 + movdqu 48(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results */ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_32_48) + + .p2align 4 +L(continue_16): + and $15, %ch + jz L(continue_16_00) + cmp $16, %eax + jb L(continue_0_16) + cmp $32, %eax + jb L(continue_16_16) + cmp $48, %eax + jb L(continue_16_32) + + .p2align 4 +L(continue_16_48): + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 16(%edi), %xmm1 + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + mov 32(%esi), %ecx + cmp %ecx, 32(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 36(%esi), %ecx + cmp %ecx, 36(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 40(%esi), %ecx + cmp %ecx, 40(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 44(%esi), %ecx + cmp %ecx, 44(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + movdqu 48(%edi), %xmm1 + movdqu 48(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_16_48) + + .p2align 4 +L(continue_00_00): + movdqa (%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqa 16(%edi), %xmm3 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%esi), %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqa 32(%edi), %xmm5 + pcmpeqd %xmm5, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%esi), %xmm5 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm5 /* packed sub of comparison results*/ + pmovmskb %xmm5, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqa 48(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_00_00) + + .p2align 4 +L(continue_00_32): + movdqu (%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_00_48) + + .p2align 4 +L(continue_00_16): + movdqu (%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %esi + add $32, %edi + jmp L(continue_00_48) + + .p2align 4 +L(continue_00_0): + movdqu (%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd (%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? 
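+   In the _00 variants at least one pointer is known to be 16-byte
+   aligned, so the code can use movdqa, or pass the aligned side to
+   pcmpeqd as a memory operand (legacy SSE memory operands must be
+   16-byte aligned), saving a separate load.  In intrinsics terms this
+   is the difference between (an illustration):
+
+     __m128i a = _mm_load_si128 ((const __m128i *) p_aligned);
+     __m128i b = _mm_loadu_si128 ((const __m128i *) p_unaligned);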
*/ + pcmpeqd 16(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%edi), %xmm2 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %esi + add $48, %edi + jmp L(continue_00_48) + + .p2align 4 +L(continue_48_00): + pcmpeqd (%esi), %xmm0 + mov (%edi), %eax + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jnz L(less4_double_words1) + + cmp (%esi), %eax + jne L(nequal) + + mov 4(%edi), %eax + cmp 4(%esi), %eax + jne L(nequal) + + mov 8(%edi), %eax + cmp 8(%esi), %eax + jne L(nequal) + + mov 12(%edi), %eax + cmp 12(%esi), %eax + jne L(nequal) + + movdqu 16(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + movdqu 48(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 48(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_48) + + add $64, %esi + add $64, %edi + jmp L(continue_48_00) + + .p2align 4 +L(continue_32_00): + movdqu (%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_48_00) + + .p2align 4 +L(continue_16_00): + movdqu (%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %esi + add $32, %edi + jmp L(continue_48_00) + + .p2align 4 +L(continue_0_00): + movdqu (%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
*/ + pcmpeqd (%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 16(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd 32(%esi), %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %esi + add $48, %edi + jmp L(continue_48_00) + + .p2align 4 +L(continue_32_32): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_48_48) + + .p2align 4 +L(continue_16_16): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm3 + movdqu 16(%esi), %xmm4 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %esi + add $32, %edi + jmp L(continue_48_48) + + .p2align 4 +L(continue_0_0): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm3 + movdqu 16(%esi), %xmm4 + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm3 /* packed sub of comparison results*/ + pmovmskb %xmm3, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + movdqu 32(%edi), %xmm1 + movdqu 32(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_32) + + add $48, %esi + add $48, %edi + jmp L(continue_48_48) + + .p2align 4 +L(continue_0_16): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? 
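+   When the combined mask fails the 0xffff test, %edx holds
+   mask - 0xffff, and the L(less4_double_words*) handlers below locate
+   the first offending wide character by testing the nibbles of %dl
+   and %dh -- each wide character contributes four bits to the
+   pmovmskb result.  This avoids bsf, which was slow on the in-order
+   Atom cores of the era.  A portable sketch of the same selection
+   (an illustration only):
+
+     unsigned bad = 0xffff ^ mask;           bits set where unequal or null
+     unsigned idx = __builtin_ctz (bad) >> 2;  index of that wide char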
*/ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + movdqu 16(%edi), %xmm1 + movdqu 16(%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words_16) + + add $32, %esi + add $32, %edi + jmp L(continue_32_48) + + .p2align 4 +L(continue_0_32): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_16_48) + + .p2align 4 +L(continue_16_32): + movdqu (%edi), %xmm1 + movdqu (%esi), %xmm2 + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */ + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */ + jnz L(less4_double_words) + + add $16, %esi + add $16, %edi + jmp L(continue_32_48) + + .p2align 4 +L(less4_double_words1): + cmp (%esi), %eax + jne L(nequal) + test %eax, %eax + jz L(equal) + + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + test %ecx, %ecx + jz L(equal) + + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + xor %eax, %eax + RETURN + + .p2align 4 +L(less4_double_words): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words) + and $15, %dl + jz L(second_double_word) + mov (%esi), %ecx + cmp %ecx, (%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word): + mov 4(%esi), %ecx + cmp %ecx, 4(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words): + and $15, %dh + jz L(fourth_double_word) + mov 8(%esi), %ecx + cmp %ecx, 8(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word): + mov 12(%esi), %ecx + cmp %ecx, 12(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(less4_double_words_16): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_16) + and $15, %dl + jz L(second_double_word_16) + mov 16(%esi), %ecx + cmp %ecx, 16(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word_16): + mov 20(%esi), %ecx + cmp %ecx, 20(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words_16): + and $15, %dh + jz L(fourth_double_word_16) + mov 24(%esi), %ecx + cmp %ecx, 24(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word_16): + mov 28(%esi), %ecx + cmp %ecx, 28(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(less4_double_words_32): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_32) + and $15, %dl + jz L(second_double_word_32) + mov 32(%esi), %ecx + cmp %ecx, 32(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word_32): + mov 36(%esi), %ecx + cmp %ecx, 36(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words_32): + and $15, %dh + jz L(fourth_double_word_32) + mov 40(%esi), %ecx + cmp %ecx, 
40(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word_32): + mov 44(%esi), %ecx + cmp %ecx, 44(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(less4_double_words_48): + xor %eax, %eax + test %dl, %dl + jz L(next_two_double_words_48) + and $15, %dl + jz L(second_double_word_48) + mov 48(%esi), %ecx + cmp %ecx, 48(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(second_double_word_48): + mov 52(%esi), %ecx + cmp %ecx, 52(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(next_two_double_words_48): + and $15, %dh + jz L(fourth_double_word_48) + mov 56(%esi), %ecx + cmp %ecx, 56(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(fourth_double_word_48): + mov 60(%esi), %ecx + cmp %ecx, 60(%edi) + jne L(nequal) + RETURN + + .p2align 4 +L(nequal): + mov $1, %eax + jg L(return) + neg %eax + RETURN + + .p2align 4 +L(return): + RETURN + + .p2align 4 +L(equal): + xorl %eax, %eax + RETURN + + CFI_POP (%edi) + CFI_POP (%esi) + + .p2align 4 +L(neq): + mov $1, %eax + jg L(neq_bigger) + neg %eax + +L(neq_bigger): + ret + + .p2align 4 +L(eq): + xorl %eax, %eax + ret + +END (__wcscmp_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S new file mode 100644 index 0000000000..7118bdd4db --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscmp.S @@ -0,0 +1,39 @@ +/* Multiple versions of wcscmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc and for the + DSO. In static binaries, we need wcscmp before the initialization + happened. */ +#if IS_IN (libc) + .text +ENTRY(__wcscmp) + .type __wcscmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcscmp_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__wcscmp_sse2) +2: ret +END(__wcscmp) +weak_alias (__wcscmp, wcscmp) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c new file mode 100644 index 0000000000..fb3000392b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-c.c @@ -0,0 +1,5 @@ +#if IS_IN (libc) +# define wcscpy __wcscpy_ia32 +#endif + +#include "wcsmbs/wcscpy.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S new file mode 100644 index 0000000000..6280ba92ab --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy-ssse3.S @@ -0,0 +1,600 @@ +/* wcscpy with SSSE3 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# include <sysdep.h> + +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) + +# define PARMS 4 +# define RETURN POP (%edi); ret; CFI_PUSH (%edi) +# define STR1 PARMS +# define STR2 STR1+4 +# define LEN STR2+4 + + atom_text_section +ENTRY (__wcscpy_ssse3) + mov STR1(%esp), %edx + mov STR2(%esp), %ecx + + cmp $0, (%ecx) + jz L(ExitTail4) + cmp $0, 4(%ecx) + jz L(ExitTail8) + cmp $0, 8(%ecx) + jz L(ExitTail12) + cmp $0, 12(%ecx) + jz L(ExitTail16) + + PUSH (%edi) + mov %edx, %edi + PUSH (%esi) + lea 16(%ecx), %esi + + and $-16, %esi + + pxor %xmm0, %xmm0 + pcmpeqd (%esi), %xmm0 + movdqu (%ecx), %xmm1 + movdqu %xmm1, (%edx) + + pmovmskb %xmm0, %eax + sub %ecx, %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + mov %edx, %eax + lea 16(%edx), %edx + and $-16, %edx + sub %edx, %eax + + sub %eax, %ecx + mov %ecx, %eax + and $0xf, %eax + mov $0, %esi + + jz L(Align16Both) + cmp $4, %eax + je L(Shl4) + cmp $8, %eax + je L(Shl8) + jmp L(Shl12) + +L(Align16Both): + movaps (%ecx), %xmm1 + movaps 16(%ecx), %xmm2 + movaps %xmm1, (%edx) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm4 + movaps %xmm3, (%edx, %esi) + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm1 + movaps %xmm4, (%edx, %esi) + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm2 + movaps %xmm1, (%edx, %esi) + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps 16(%ecx, %esi), %xmm3 + movaps %xmm2, (%edx, %esi) + pcmpeqd %xmm3, %xmm0 + pmovmskb %xmm0, %eax + lea 16(%esi), %esi + + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + movaps %xmm3, (%edx, %esi) + mov %ecx, %eax + lea 16(%ecx, %esi), %ecx + and $-0x40, %ecx + sub %ecx, %eax + sub %eax, %edx + + mov $-0x40, %esi + +L(Aligned64Loop): + movaps (%ecx), %xmm2 + movaps 32(%ecx), %xmm3 + movaps %xmm2, %xmm4 + movaps 16(%ecx), %xmm5 + movaps %xmm3, %xmm6 + movaps 48(%ecx), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + lea 64(%edx), %edx + pcmpeqd %xmm0, %xmm3 + lea 64(%ecx), %ecx + pmovmskb %xmm3, %eax + + test %eax, %eax + jnz L(Aligned64Leave) + movaps %xmm4, -64(%edx) + movaps %xmm5, -48(%edx) + movaps %xmm6, -32(%edx) + movaps %xmm7, 
-16(%edx) + jmp L(Aligned64Loop) + +L(Aligned64Leave): + pcmpeqd %xmm4, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm5, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm4, -64(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + pcmpeqd %xmm6, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm5, -48(%edx) + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + movaps %xmm6, -32(%edx) + pcmpeqd %xmm7, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + lea 16(%esi), %esi + jnz L(CopyFrom1To16Bytes) + + mov $-0x40, %esi + movaps %xmm7, -16(%edx) + jmp L(Aligned64Loop) + + .p2align 4 +L(Shl4): + movaps -4(%ecx), %xmm1 + movaps 12(%ecx), %xmm2 +L(Shl4Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm1 + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 28(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + + test %eax, %eax + jnz L(Shl4LoopExit) + + palignr $4, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 28(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -12(%ecx), %ecx + sub %eax, %edx + + movaps -4(%ecx), %xmm1 + +L(Shl4LoopStart): + movaps 12(%ecx), %xmm2 + movaps 28(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 44(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 60(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + test %eax, %eax + palignr $4, %xmm3, %xmm4 + jnz L(Shl4Start) + + palignr $4, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $4, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl4LoopStart) + +L(Shl4LoopExit): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) + POP (%esi) + add $12, %edx + add $12, %ecx + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(Shl8): + movaps -8(%ecx), %xmm1 + movaps 8(%ecx), %xmm2 +L(Shl8Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm1 + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm3, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 24(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + + test %eax, %eax + jnz L(Shl8LoopExit) + + palignr $8, 
%xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 24(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -8(%ecx), %ecx + sub %eax, %edx + + movaps -8(%ecx), %xmm1 + +L(Shl8LoopStart): + movaps 8(%ecx), %xmm2 + movaps 24(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 40(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 56(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $8, %xmm4, %xmm5 + test %eax, %eax + palignr $8, %xmm3, %xmm4 + jnz L(Shl8Start) + + palignr $8, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $8, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl8LoopStart) + +L(Shl8LoopExit): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + POP (%esi) + add $8, %edx + add $8, %ecx + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(Shl12): + movaps -12(%ecx), %xmm1 + movaps 4(%ecx), %xmm2 +L(Shl12Start): + pcmpeqd %xmm2, %xmm0 + pmovmskb %xmm0, %eax + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm1 + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + movaps %xmm2, %xmm3 + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps 20(%ecx), %xmm2 + + pcmpeqd %xmm2, %xmm0 + lea 16(%edx), %edx + pmovmskb %xmm0, %eax + lea 16(%ecx), %ecx + + test %eax, %eax + jnz L(Shl12LoopExit) + + palignr $12, %xmm3, %xmm2 + movaps %xmm2, (%edx) + lea 20(%ecx), %ecx + lea 16(%edx), %edx + + mov %ecx, %eax + and $-0x40, %ecx + sub %ecx, %eax + lea -4(%ecx), %ecx + sub %eax, %edx + + movaps -12(%ecx), %xmm1 + +L(Shl12LoopStart): + movaps 4(%ecx), %xmm2 + movaps 20(%ecx), %xmm3 + movaps %xmm3, %xmm6 + movaps 36(%ecx), %xmm4 + movaps %xmm4, %xmm7 + movaps 52(%ecx), %xmm5 + pminub %xmm2, %xmm6 + pminub %xmm5, %xmm7 + pminub %xmm6, %xmm7 + pcmpeqd %xmm0, %xmm7 + pmovmskb %xmm7, %eax + movaps %xmm5, %xmm7 + palignr $12, %xmm4, %xmm5 + test %eax, %eax + palignr $12, %xmm3, %xmm4 + jnz L(Shl12Start) + + palignr $12, %xmm2, %xmm3 + lea 64(%ecx), %ecx + palignr $12, %xmm1, %xmm2 + movaps %xmm7, %xmm1 + movaps %xmm5, 48(%edx) + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm2, (%edx) + lea 64(%edx), %edx + jmp L(Shl12LoopStart) + +L(Shl12LoopExit): + movl (%ecx), %esi + movl %esi, (%edx) + mov $4, %esi + + .p2align 4 +L(CopyFrom1To16Bytes): + add %esi, %edx + add %esi, %ecx + + POP (%esi) + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit4) +L(Exit8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edi, %eax + RETURN + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit12) +L(Exit16): + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + movl %edi, %eax + RETURN + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) + movl %edi, %eax + RETURN + + .p2align 4 +L(Exit12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) + movl %edi, %eax + RETURN + +CFI_POP (%edi) + + 
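+/* The L(Shl4)/L(Shl8)/L(Shl12) paths above realign a misaligned
+   source to the aligned destination with palignr: two adjacent
+   aligned 16-byte source blocks are concatenated and shifted right
+   by 4, 8 or 12 bytes, so every store can be an aligned movaps.  A
+   rough intrinsics sketch of one step (an illustration; the names
+   are invented and N stands for the constant shift):
+
+     __m128i prev = _mm_load_si128 ((const __m128i *) (src - N));
+     __m128i next = _mm_load_si128 ((const __m128i *) (src - N + 16));
+     _mm_store_si128 ((__m128i *) dst, _mm_alignr_epi8 (next, prev, N));
+
+   The pminub chains in the L(Shl*LoopStart) loops fold the null
+   checks of four blocks into one: the byte-wise minimum contains a
+   zero wide character wherever one of the blocks does, so a single
+   pcmpeqd against zero covers 64 bytes per iteration.  */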
.p2align 4 +L(ExitTail4): + movl (%ecx), %eax + movl %eax, (%edx) + movl %edx, %eax + ret + + .p2align 4 +L(ExitTail8): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl %edx, %eax + ret + + .p2align 4 +L(ExitTail12): + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movl 8(%ecx), %eax + movl %eax, 8(%edx) + movl %edx, %eax + ret + + .p2align 4 +L(ExitTail16): + movdqu (%ecx), %xmm0 + movdqu %xmm0, (%edx) + movl %edx, %eax + ret + +END (__wcscpy_ssse3) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S new file mode 100644 index 0000000000..cfc97dd87c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcscpy.S @@ -0,0 +1,36 @@ +/* Multiple versions of wcscpy + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(wcscpy) + .type wcscpy, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcscpy_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__wcscpy_ssse3) +2: ret +END(wcscpy) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c new file mode 100644 index 0000000000..a335dc0f7e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-c.c @@ -0,0 +1,9 @@ +#include <wchar.h> + +#if IS_IN (libc) +# define WCSLEN __wcslen_ia32 +#endif + +extern __typeof (wcslen) __wcslen_ia32; + +#include "wcsmbs/wcslen.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S new file mode 100644 index 0000000000..bd3fc4c79b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen-sse2.S @@ -0,0 +1,193 @@ +/* wcslen with SSE2 + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#if IS_IN (libc) +# include <sysdep.h> +# define STR 4 + + .text +ENTRY (__wcslen_sse2) + mov STR(%esp), %edx + + cmp $0, (%edx) + jz L(exit_tail0) + cmp $0, 4(%edx) + jz L(exit_tail1) + cmp $0, 8(%edx) + jz L(exit_tail2) + cmp $0, 12(%edx) + jz L(exit_tail3) + cmp $0, 16(%edx) + jz L(exit_tail4) + cmp $0, 20(%edx) + jz L(exit_tail5) + cmp $0, 24(%edx) + jz L(exit_tail6) + cmp $0, 28(%edx) + jz L(exit_tail7) + + pxor %xmm0, %xmm0 + + lea 32(%edx), %eax + lea 16(%edx), %ecx + and $-16, %eax + + pcmpeqd (%eax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + pcmpeqd (%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%eax), %eax + jnz L(exit) + + and $-0x40, %eax + + .p2align 4 +L(aligned_64_loop): + movaps (%eax), %xmm0 + movaps 16(%eax), %xmm1 + movaps 32(%eax), %xmm2 + movaps 48(%eax), %xmm6 + + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 64(%eax), %eax + jz L(aligned_64_loop) + + pcmpeqd -64(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 48(%ecx), %ecx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqd -32(%eax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea -16(%ecx), %ecx + jnz L(exit) + + jmp L(aligned_64_loop) + + .p2align 4 +L(exit): + sub %ecx, %eax + shr $2, %eax + test %dl, %dl + jz L(exit_high) + + mov %dl, %cl + and $15, %cl + jz L(exit_1) + ret + + .p2align 4 +L(exit_high): + mov %dh, %ch + and $15, %ch + jz L(exit_3) + add $2, %eax + ret + + .p2align 4 +L(exit_1): + add $1, %eax + ret + + .p2align 4 +L(exit_3): + add $3, %eax + ret + + .p2align 4 +L(exit_tail0): + xor %eax, %eax + ret + + .p2align 4 +L(exit_tail1): + mov $1, %eax + ret + + .p2align 4 +L(exit_tail2): + mov $2, %eax + ret + + .p2align 4 +L(exit_tail3): + mov $3, %eax + ret + + .p2align 4 +L(exit_tail4): + mov $4, %eax + ret + + .p2align 4 +L(exit_tail5): + mov $5, %eax + ret + + .p2align 4 +L(exit_tail6): + mov $6, %eax + ret + + .p2align 4 +L(exit_tail7): + mov $7, %eax + ret + +END (__wcslen_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S new file mode 100644 index 0000000000..6ef9b6e7b5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcslen.S @@ -0,0 +1,37 @@ +/* Multiple versions of wcslen + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
 You should have received a copy of the GNU Lesser General Public
 License along with the GNU C Library; if not, see
 <http://www.gnu.org/licenses/>. */
 
#include <sysdep.h>
#include <init-arch.h>
 
#if IS_IN (libc)
 .text
ENTRY(__wcslen)
 .type __wcslen, @gnu_indirect_function
 LOAD_GOT_AND_RTLD_GLOBAL_RO
 LOAD_FUNC_GOT_EAX (__wcslen_ia32)
 HAS_CPU_FEATURE (SSE2)
 jz 2f
 LOAD_FUNC_GOT_EAX (__wcslen_sse2)
2: ret
END(__wcslen)
 
weak_alias(__wcslen, wcslen)
#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c new file mode 100644 index 0000000000..8d8a335b5b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-c.c @@ -0,0 +1,5 @@ +#if IS_IN (libc)
# define wcsrchr __wcsrchr_ia32
#endif
 
#include "wcsmbs/wcsrchr.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S new file mode 100644 index 0000000000..1a9b60e55e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr-sse2.S @@ -0,0 +1,354 @@ +/* wcsrchr with SSE2, without using bsf instructions.
 Copyright (C) 2011-2017 Free Software Foundation, Inc.
 Contributed by Intel Corporation.
 This file is part of the GNU C Library.
 
 The GNU C Library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.
 
 The GNU C Library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.
 
 You should have received a copy of the GNU Lesser General Public
 License along with the GNU C Library; if not, see
 <http://www.gnu.org/licenses/>. */
 
#if IS_IN (libc)
# include <sysdep.h>
# define CFI_PUSH(REG) \
 cfi_adjust_cfa_offset (4); \
 cfi_rel_offset (REG, 0)
 
# define CFI_POP(REG) \
 cfi_adjust_cfa_offset (-4); \
 cfi_restore (REG)
 
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
# define POP(REG) popl REG; CFI_POP (REG)
 
# define PARMS 8
# define ENTRANCE PUSH (%edi);
# define RETURN POP (%edi); ret; CFI_PUSH (%edi);
# define STR1 PARMS
# define STR2 STR1+4
 
 atom_text_section
ENTRY (__wcsrchr_sse2)
 
 ENTRANCE
 mov STR1(%esp), %ecx
 movd STR2(%esp), %xmm1
 
 mov %ecx, %edi
 punpckldq %xmm1, %xmm1
 pxor %xmm2, %xmm2
 punpckldq %xmm1, %xmm1
 
/* ECX has OFFSET. */
 and $63, %ecx
 cmp $48, %ecx
 ja L(crosscache)
 
/* unaligned string. */
 movdqu (%edi), %xmm0
 pcmpeqd %xmm0, %xmm2
 pcmpeqd %xmm1, %xmm0
/* Find where NULL is. */
 pmovmskb %xmm2, %ecx
/* Check if there is a match. */
 pmovmskb %xmm0, %eax
 add $16, %edi
 
 test %eax, %eax
 jnz L(unaligned_match1)
 
 test %ecx, %ecx
 jnz L(return_null)
 
 and $-16, %edi
 
 PUSH (%esi)
 
 xor %edx, %edx
 jmp L(loop)
 
 CFI_POP (%esi)
 
 .p2align 4
L(unaligned_match1):
 test %ecx, %ecx
 jnz L(prolog_find_zero_1)
 
 PUSH (%esi)
 
/* Save current match. */
 mov %eax, %edx
 mov %edi, %esi
 and $-16, %edi
 jmp L(loop)
 
 CFI_POP (%esi)
 
 .p2align 4
L(crosscache):
/* Handle unaligned string. */
 and $15, %ecx
 and $-16, %edi
 pxor %xmm3, %xmm3
 movdqa (%edi), %xmm0
 pcmpeqd %xmm0, %xmm3
 pcmpeqd %xmm1, %xmm0
/* Find where NULL is. */
 pmovmskb %xmm3, %edx
/* Check if there is a match.
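+   Because %edi was rounded down to the preceding 16-byte boundary,
+   both masks may contain bits for bytes that lie before the real
+   start of the string; the two shr %cl instructions below discard
+   those bits from the null mask and the match mask alike (%ecx holds
+   the misalignment in bytes).  Roughly (an illustration):
+
+     nullmask >>= misalign;
+     matchmask >>= misalign;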
*/ + pmovmskb %xmm0, %eax +/* Remove the leading bytes. */ + shr %cl, %edx + shr %cl, %eax + add $16, %edi + + test %eax, %eax + jnz L(unaligned_match) + + test %edx, %edx + jnz L(return_null) + + PUSH (%esi) + + xor %edx, %edx + jmp L(loop) + + CFI_POP (%esi) + + .p2align 4 +L(unaligned_match): + test %edx, %edx + jnz L(prolog_find_zero) + + PUSH (%esi) + + mov %eax, %edx + lea (%edi, %ecx), %esi + +/* Loop start on aligned string. */ + .p2align 4 +L(loop): + movdqa (%edi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %ecx + pmovmskb %xmm0, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm3 + pcmpeqd %xmm3, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm2, %ecx + pmovmskb %xmm3, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm4 + pcmpeqd %xmm4, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm4 + pmovmskb %xmm2, %ecx + pmovmskb %xmm4, %eax + or %eax, %ecx + jnz L(matches) + + movdqa (%edi), %xmm5 + pcmpeqd %xmm5, %xmm2 + add $16, %edi + pcmpeqd %xmm1, %xmm5 + pmovmskb %xmm2, %ecx + pmovmskb %xmm5, %eax + or %eax, %ecx + jz L(loop) + + .p2align 4 +L(matches): + test %eax, %eax + jnz L(match) +L(return_value): + test %edx, %edx + jz L(return_null_1) + mov %edx, %eax + mov %esi, %edi + + POP (%esi) + + test %ah, %ah + jnz L(match_third_or_fourth_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(return_null_1): + POP (%esi) + + xor %eax, %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(match): + pmovmskb %xmm2, %ecx + test %ecx, %ecx + jnz L(find_zero) +/* save match info */ + mov %eax, %edx + mov %edi, %esi + jmp L(loop) + + .p2align 4 +L(find_zero): + test %cl, %cl + jz L(find_zero_in_third_or_fourth_wchar) + test $15, %cl + jz L(find_zero_in_second_wchar) + and $1, %eax + jz L(return_value) + + POP (%esi) + + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_in_second_wchar): + and $1 << 5 - 1, %eax + jz L(return_value) + + POP (%esi) + + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_in_third_or_fourth_wchar): + test $15, %ch + jz L(find_zero_in_fourth_wchar) + and $1 << 9 - 1, %eax + jz L(return_value) + + POP (%esi) + + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(find_zero_in_fourth_wchar): + + POP (%esi) + + test %ah, %ah + jnz L(match_third_or_fourth_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + CFI_PUSH (%esi) + + .p2align 4 +L(match_second_wchar): + lea -12(%edi), %eax + RETURN + + .p2align 4 +L(match_third_or_fourth_wchar): + test $15 << 4, %ah + jnz L(match_fourth_wchar) + lea -8(%edi), %eax + RETURN + + .p2align 4 +L(match_third_wchar): + lea -8(%edi), %eax + RETURN + + .p2align 4 +L(match_fourth_wchar): + lea -4(%edi), %eax + RETURN + + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN + + .p2align 4 +L(prolog_find_zero): + add %ecx, %edi + mov %edx, %ecx +L(prolog_find_zero_1): + test %cl, %cl + jz L(prolog_find_zero_in_third_or_fourth_wchar) + test $15, %cl + jz L(prolog_find_zero_in_second_wchar) + and $1, %eax + jz L(return_null) + + lea -16(%edi), %eax + RETURN + + .p2align 4 +L(prolog_find_zero_in_second_wchar): + and $1 << 5 - 1, %eax + jz L(return_null) + + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + .p2align 4 
+L(prolog_find_zero_in_third_or_fourth_wchar): + test $15, %ch + jz L(prolog_find_zero_in_fourth_wchar) + and $1 << 9 - 1, %eax + jz L(return_null) + + test %ah, %ah + jnz L(match_third_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + + .p2align 4 +L(prolog_find_zero_in_fourth_wchar): + test %ah, %ah + jnz L(match_third_or_fourth_wchar) + test $15 << 4, %al + jnz L(match_second_wchar) + lea -16(%edi), %eax + RETURN + +END (__wcsrchr_sse2) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S new file mode 100644 index 0000000000..cf67333995 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wcsrchr.S @@ -0,0 +1,35 @@ +/* Multiple versions of wcsrchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +#if IS_IN (libc) + .text +ENTRY(wcsrchr) + .type wcsrchr, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wcsrchr_ia32) + HAS_CPU_FEATURE (SSE2) + jz 2f + LOAD_FUNC_GOT_EAX (__wcsrchr_sse2) +2: ret +END(wcsrchr) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c new file mode 100644 index 0000000000..75ab4b94c1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-c.c @@ -0,0 +1,9 @@ +#include <wchar.h> + +#if IS_IN (libc) +# define WMEMCMP __wmemcmp_ia32 +#endif + +extern __typeof (wmemcmp) __wmemcmp_ia32; + +#include "wcsmbs/wmemcmp.c" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S new file mode 100644 index 0000000000..1a857c7e21 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-sse4.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_sse4_2 + +#include "memcmp-sse4.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S new file mode 100644 index 0000000000..a41ef95fc1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_WMEMCMP 1 +#define MEMCMP __wmemcmp_ssse3 + +#include "memcmp-ssse3.S" diff --git a/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S new file mode 100644 index 0000000000..1b9a54a413 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/multiarch/wmemcmp.S @@ -0,0 +1,40 @@ +/* Multiple versions of wmemcmp + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2011-2017 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) + .text +ENTRY(wmemcmp) + .type wmemcmp, @gnu_indirect_function + LOAD_GOT_AND_RTLD_GLOBAL_RO + LOAD_FUNC_GOT_EAX (__wmemcmp_ia32) + HAS_CPU_FEATURE (SSSE3) + jz 2f + LOAD_FUNC_GOT_EAX (__wmemcmp_ssse3) + HAS_CPU_FEATURE (SSE4_2) + jz 2f + LOAD_FUNC_GOT_EAX (__wmemcmp_sse4_2) +2: ret +END(wmemcmp) +#endif diff --git a/REORG.TODO/sysdeps/i386/i686/nptl/tls.h b/REORG.TODO/sysdeps/i386/i686/nptl/tls.h new file mode 100644 index 0000000000..5b527af9d3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/nptl/tls.h @@ -0,0 +1,35 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _TLS_H + +/* Additional definitions for <tls.h> on i686 and up. */ + + +/* Macros to load from and store into segment registers. We can use + the 32-bit instructions. */ +#define TLS_GET_GS() \ + ({ int __seg; __asm ("movl %%gs, %0" : "=q" (__seg)); __seg; }) +#define TLS_SET_GS(val) \ + __asm ("movl %0, %%gs" :: "q" (val)) + + +/* Get the full set of definitions. */ +#include_next <tls.h> + +#endif /* tls.h */ diff --git a/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S b/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S new file mode 100644 index 0000000000..ce9c94d41a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/pthread_spin_trylock.S @@ -0,0 +1,20 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
 You should have received a copy of the GNU Lesser General Public
 License along with the GNU C Library; if not, see
 <http://www.gnu.org/licenses/>. */
 
#define HAVE_CMOV 1
#include <sysdeps/i386/pthread_spin_trylock.S> diff --git a/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h b/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h new file mode 100644 index 0000000000..9b5a1b0d47 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/stack-aliasing.h @@ -0,0 +1,23 @@ +/* Define macros for stack address aliasing issues for NPTL. i686 version.
 Copyright (C) 2014-2017 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
 
 The GNU C Library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.
 
 The GNU C Library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.
 
 You should have received a copy of the GNU Lesser General Public
 License along with the GNU C Library; if not, see
 <http://www.gnu.org/licenses/>. */
 
/* The point is to avoid the 64k aliasing problem, which reliably
 happens if all stacks use sizes which are a multiple of 64k. Tell
 the stack allocator to disturb this by allocating one more page if
 necessary. */
#define MULTI_PAGE_ALIASING 65536 diff --git a/REORG.TODO/sysdeps/i386/i686/strcmp.S b/REORG.TODO/sysdeps/i386/i686/strcmp.S new file mode 100644 index 0000000000..1ae305912e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/strcmp.S @@ -0,0 +1,52 @@ +/* Highly optimized version for ix86, x>=6.
 Copyright (C) 1999-2017 Free Software Foundation, Inc.
 This file is part of the GNU C Library.
 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
 
 The GNU C Library is free software; you can redistribute it and/or
 modify it under the terms of the GNU Lesser General Public
 License as published by the Free Software Foundation; either
 version 2.1 of the License, or (at your option) any later version.
 
 The GNU C Library is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 Lesser General Public License for more details.
 
 You should have received a copy of the GNU Lesser General Public
 License along with the GNU C Library; if not, see
 <http://www.gnu.org/licenses/>. */
 
#include <sysdep.h>
#include "asm-syntax.h"
 
#define PARMS 4 /* no space for saved regs */
#define STR1 PARMS
#define STR2 STR1+4
 
 .text
ENTRY (strcmp)
 
 movl STR1(%esp), %ecx
 movl STR2(%esp), %edx
 
L(oop): movb (%ecx), %al
 cmpb (%edx), %al
 jne L(neq)
 incl %ecx
 incl %edx
 testb %al, %al
 jnz L(oop)
 
 xorl %eax, %eax
 /* When the strings are equal, the pointers rest one beyond
 the end of the NUL terminators. */
 ret
 
L(neq): movl $1, %eax
 movl $-1, %ecx
 cmovbl %ecx, %eax
 
 ret
END (strcmp)
libc_hidden_builtin_def (strcmp) diff --git a/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h b/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h new file mode 100644 index 0000000000..51f03fe77b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i686/tst-stack-align.h @@ -0,0 +1,44 @@ +/* Copyright (C) 2003-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdio.h> +#include <stdint.h> +#ifndef __SSE__ +#include_next <tst-stack-align.h> +#else +#include <xmmintrin.h> + +#define TEST_STACK_ALIGN() \ + ({ \ + __m128 _m; \ + double _d = 12.0; \ + long double _ld = 15.0; \ + int _ret = 0; \ + printf ("__m128: %p %zu\n", &_m, __alignof (__m128)); \ + if ((((uintptr_t) &_m) & (__alignof (__m128) - 1)) != 0) \ + _ret = 1; \ + \ + printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ + if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ + _ret = 1; \ + \ + printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ + if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ + _ret = 1; \ + _ret; \ + }) +#endif diff --git a/REORG.TODO/sysdeps/i386/i786/Implies b/REORG.TODO/sysdeps/i386/i786/Implies new file mode 100644 index 0000000000..1cd29f63cf --- /dev/null +++ b/REORG.TODO/sysdeps/i386/i786/Implies @@ -0,0 +1,2 @@ +# The PPro and PII cores are mostly the same. +i386/i686 diff --git a/REORG.TODO/sysdeps/i386/init-arch.h b/REORG.TODO/sysdeps/i386/init-arch.h new file mode 100644 index 0000000000..72881c5679 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/init-arch.h @@ -0,0 +1,19 @@ +/* Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define MINIMUM_ISA 486 +#include <sysdeps/x86/init-arch.h> diff --git a/REORG.TODO/sysdeps/i386/jmpbuf-offsets.h b/REORG.TODO/sysdeps/i386/jmpbuf-offsets.h new file mode 100644 index 0000000000..1c95db7287 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/jmpbuf-offsets.h @@ -0,0 +1,25 @@ +/* Private macros for accessing __jmp_buf contents. i386 version. + Copyright (C) 2006-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define JB_BX 0 +#define JB_SI 1 +#define JB_DI 2 +#define JB_BP 3 +#define JB_SP 4 +#define JB_PC 5 +#define JB_SIZE 24 diff --git a/REORG.TODO/sysdeps/i386/jmpbuf-unwind.h b/REORG.TODO/sysdeps/i386/jmpbuf-unwind.h new file mode 100644 index 0000000000..0a63a832cc --- /dev/null +++ b/REORG.TODO/sysdeps/i386/jmpbuf-unwind.h @@ -0,0 +1,47 @@ +/* Copyright (C) 2003-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Jakub Jelinek <jakub@redhat.com>, 2003. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <setjmp.h> +#include <jmpbuf-offsets.h> +#include <stdint.h> +#include <unwind.h> +#include <sysdep.h> + +/* Test if longjmp to JMPBUF would unwind the frame + containing a local variable at ADDRESS. */ +#define _JMPBUF_UNWINDS(jmpbuf, address, demangle) \ + ((void *) (address) < (void *) demangle ((jmpbuf)[JB_SP])) + +#define _JMPBUF_CFA_UNWINDS_ADJ(_jmpbuf, _context, _adj) \ + _JMPBUF_UNWINDS_ADJ (_jmpbuf, (void *) _Unwind_GetCFA (_context), _adj) + +static inline uintptr_t __attribute__ ((unused)) +_jmpbuf_sp (__jmp_buf regs) +{ + uintptr_t sp = regs[JB_SP]; +#ifdef PTR_DEMANGLE + PTR_DEMANGLE (sp); +#endif + return sp; +} + +#define _JMPBUF_UNWINDS_ADJ(_jmpbuf, _address, _adj) \ + ((uintptr_t) (_address) - (_adj) < _jmpbuf_sp (_jmpbuf) - (_adj)) + +/* We use the normal longjmp for unwinding. */ +#define __libc_unwind_longjmp(buf, val) __libc_longjmp (buf, val) diff --git a/REORG.TODO/sysdeps/i386/ldbl2mpn.c b/REORG.TODO/sysdeps/i386/ldbl2mpn.c new file mode 100644 index 0000000000..076be0ae7e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/ldbl2mpn.c @@ -0,0 +1,120 @@ +/* Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include "gmp.h" +#include "gmp-impl.h" +#include "longlong.h" +#include <ieee754.h> +#include <float.h> +#include <stdlib.h> + +/* Convert a `long double' in IEEE854 standard double-precision format to a + multi-precision integer representing the significand scaled up by its + number of bits (64 for long double) and an integral power of two + (MPN frexpl). */ + +mp_size_t +__mpn_extract_long_double (mp_ptr res_ptr, mp_size_t size, + int *expt, int *is_neg, + long double value) +{ + union ieee854_long_double u; + u.d = value; + + *is_neg = u.ieee.negative; + *expt = (int) u.ieee.exponent - IEEE854_LONG_DOUBLE_BIAS; + +#if BITS_PER_MP_LIMB == 32 + res_ptr[0] = u.ieee.mantissa1; /* Low-order 32 bits of fraction. */ + res_ptr[1] = u.ieee.mantissa0; /* High-order 32 bits. */ + #define N 2 +#elif BITS_PER_MP_LIMB == 64 + /* Hopefully the compiler will combine the two bitfield extracts + and this composition into just the original quadword extract. */ + res_ptr[0] = ((mp_limb_t) u.ieee.mantissa0 << 32) | u.ieee.mantissa1; + #define N 1 +#else + #error "mp_limb size " BITS_PER_MP_LIMB "not accounted for" +#endif + + if (u.ieee.exponent == 0) + { + /* A biased exponent of zero is a special case. + Either it is a zero or it is a denormal number. */ + if (res_ptr[0] == 0 && res_ptr[N - 1] == 0) /* Assumes N<=2. */ + /* It's zero. */ + *expt = 0; + else + { + /* It is a denormal number, meaning it has no implicit leading + one bit, and its exponent is in fact the format minimum. */ + int cnt; + + /* One problem with Intel's 80-bit format is that the explicit + leading one in the normalized representation has to be zero + for denormalized number. If it is one, the number is according + to Intel's specification an invalid number. We make the + representation unique by explicitly clearing this bit. */ + res_ptr[N - 1] &= ~((mp_limb_t) 1 << ((LDBL_MANT_DIG - 1) % BITS_PER_MP_LIMB)); + + if (res_ptr[N - 1] != 0) + { + count_leading_zeros (cnt, res_ptr[N - 1]); + if (cnt != 0) + { +#if N == 2 + res_ptr[N - 1] = res_ptr[N - 1] << cnt + | (res_ptr[0] >> (BITS_PER_MP_LIMB - cnt)); + res_ptr[0] <<= cnt; +#else + res_ptr[N - 1] <<= cnt; +#endif + } + *expt = LDBL_MIN_EXP - 1 - cnt; + } + else if (res_ptr[0] != 0) + { + count_leading_zeros (cnt, res_ptr[0]); + res_ptr[N - 1] = res_ptr[0] << cnt; + res_ptr[0] = 0; + *expt = LDBL_MIN_EXP - 1 - BITS_PER_MP_LIMB - cnt; + } + else + { + /* This is the special case of the pseudo denormal number + with only the implicit leading bit set. The value is + in fact a normal number and so we have to treat this + case differently. */ +#if N == 2 + res_ptr[N - 1] = 0x80000000ul; +#else + res_ptr[0] = 0x8000000000000000ul; +#endif + *expt = LDBL_MIN_EXP - 1; + } + } + } + else if (u.ieee.exponent < 0x7fff +#if N == 2 + && res_ptr[0] == 0 +#endif + && res_ptr[N - 1] == 0) + /* Pseudo zero. */ + *expt = 0; + + return N; +} diff --git a/REORG.TODO/sysdeps/i386/ldsodefs.h b/REORG.TODO/sysdeps/i386/ldsodefs.h new file mode 100644 index 0000000000..a369f5fc68 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/ldsodefs.h @@ -0,0 +1,41 @@ +/* Run-time dynamic linker data structures for loaded ELF shared objects. + Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
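[Editor's note: __mpn_extract_long_double above relies on the x87 80-bit layout exposed by union ieee854_long_double: a 64-bit significand with an explicit integer bit, a 15-bit biased exponent, and a sign bit. A standalone sketch of that decomposition -- the union below is hand-rolled for illustration and assumes i386's little-endian layout; glibc's real definition lives in <ieee754.h>:

    #include <stdio.h>
    #include <stdint.h>

    union ld80
    {
      long double d;
      struct
      {
        uint32_t mantissa1;          /* low 32 bits of the significand */
        uint32_t mantissa0;          /* high 32 bits; explicit leading bit on top */
        unsigned int exponent : 15;  /* biased by 16383 */
        unsigned int negative : 1;
      } ieee;
    };

    int
    main (void)
    {
      union ld80 u = { .d = 1.5L };
      /* Expect: neg=0 exp=0 mant0=0xc0000000 (explicit 1 bit plus the .5 bit),
         showing why a "denormal" with the integer bit set is really normal.  */
      printf ("neg=%u exp=%d mant0=%#x mant1=%#x\n",
              u.ieee.negative, (int) u.ieee.exponent - 16383,
              u.ieee.mantissa0, u.ieee.mantissa1);
      return 0;
    }
]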
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _I386_LDSODEFS_H +#define _I386_LDSODEFS_H 1 + +#include <elf.h> +#include <cpu-features.h> + +struct La_i86_regs; +struct La_i86_retval; + +#define ARCH_PLTENTER_MEMBERS \ + Elf32_Addr (*i86_gnu_pltenter) (Elf32_Sym *, unsigned int, uintptr_t *, \ + uintptr_t *, struct La_i86_regs *, \ + unsigned int *, const char *name, \ + long int *framesizep) + +#define ARCH_PLTEXIT_MEMBERS \ + unsigned int (*i86_gnu_pltexit) (Elf32_Sym *, unsigned int, uintptr_t *, \ + uintptr_t *, const struct La_i86_regs *, \ + struct La_i86_retval *, const char *) + +#include_next <ldsodefs.h> + +#endif diff --git a/REORG.TODO/sysdeps/i386/link-defines.sym b/REORG.TODO/sysdeps/i386/link-defines.sym new file mode 100644 index 0000000000..0995adb37f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/link-defines.sym @@ -0,0 +1,20 @@ +#include "link.h" +#include <stddef.h> + +-- +LONG_DOUBLE_SIZE sizeof (long double) + +LR_SIZE sizeof (struct La_i86_regs) +LR_EDX_OFFSET offsetof (struct La_i86_regs, lr_edx) +LR_ECX_OFFSET offsetof (struct La_i86_regs, lr_ecx) +LR_EAX_OFFSET offsetof (struct La_i86_regs, lr_eax) +LR_EBP_OFFSET offsetof (struct La_i86_regs, lr_ebp) +LR_ESP_OFFSET offsetof (struct La_i86_regs, lr_esp) + +LRV_SIZE sizeof (struct La_i86_retval) +LRV_EAX_OFFSET offsetof (struct La_i86_retval, lrv_eax) +LRV_EDX_OFFSET offsetof (struct La_i86_retval, lrv_edx) +LRV_ST0_OFFSET offsetof (struct La_i86_retval, lrv_st0) +LRV_ST1_OFFSET offsetof (struct La_i86_retval, lrv_st1) +LRV_BND0_OFFSET offsetof (struct La_i86_retval, lrv_bnd0) +LRV_BND1_OFFSET offsetof (struct La_i86_retval, lrv_bnd1) diff --git a/REORG.TODO/sysdeps/i386/lshift.S b/REORG.TODO/sysdeps/i386/lshift.S new file mode 100644 index 0000000000..fa4b07793f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/lshift.S @@ -0,0 +1,103 @@ +/* i80386 __mpn_lshift -- + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+12 /* space for 3 saved regs */ +#define RES PARMS +#define S RES+4 +#define SIZE S+4 +#define CNT SIZE+4 + + .text +ENTRY (__mpn_lshift) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + pushl %ebx + cfi_adjust_cfa_offset (4) + + movl RES(%esp),%edi + cfi_rel_offset (edi, 8) + movl S(%esp),%esi + cfi_rel_offset (esi, 4) + movl SIZE(%esp),%edx + movl CNT(%esp),%ecx + subl $4,%esi /* adjust s_ptr */ + + movl (%esi,%edx,4),%ebx /* read most significant limb */ + cfi_rel_offset (ebx, 0) + cfi_remember_state + xorl %eax,%eax + shldl %cl,%ebx,%eax /* compute carry limb */ + decl %edx + jz L(end) + pushl %eax /* push carry limb onto stack */ + cfi_adjust_cfa_offset (4) + testb $1,%dl + jnz L(1) /* enter loop in the middle */ + movl %ebx,%eax + + ALIGN (3) +L(oop): movl (%esi,%edx,4),%ebx /* load next lower limb */ + shldl %cl,%ebx,%eax /* compute result limb */ + movl %eax,(%edi,%edx,4) /* store it */ + decl %edx +L(1): movl (%esi,%edx,4),%eax + shldl %cl,%eax,%ebx + movl %ebx,(%edi,%edx,4) + decl %edx + jnz L(oop) + + shll %cl,%eax /* compute least significant limb */ + movl %eax,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + cfi_adjust_cfa_offset (-4) + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret + + cfi_restore_state +L(end): shll %cl,%ebx /* compute least significant limb */ + movl %ebx,(%edi) /* store it */ + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_lshift) diff --git a/REORG.TODO/sysdeps/i386/machine-gmon.h b/REORG.TODO/sysdeps/i386/machine-gmon.h new file mode 100644 index 0000000000..d5d8cdf7c6 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/machine-gmon.h @@ -0,0 +1,40 @@ +/* i386-specific implementation of profiling support. + Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* We need a special version of the `mcount' function since for ix86 it + must not clobber any register. This has several reasons: + - there is a bug in gcc as of version 2.7.2.2 which prohibits the + use of profiling together with nested functions + - the ELF `fixup' function uses GCC's regparm feature + - some (future) systems might want to pass parameters in registers. */ + +/* We must not pollute the global namespace. 
*/
+#define mcount_internal __mcount_internal
+
+extern void mcount_internal (u_long frompc, u_long selfpc) internal_function;
+
+#define _MCOUNT_DECL(frompc, selfpc) \
+void internal_function mcount_internal (u_long frompc, u_long selfpc)
+
+
+/* Define MCOUNT as empty since we have the implementation in another
+   file.  */
+#define MCOUNT
diff --git a/REORG.TODO/sysdeps/i386/memchr.S b/REORG.TODO/sysdeps/i386/memchr.S
new file mode 100644
index 0000000000..db4a6418ff
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memchr.S
@@ -0,0 +1,322 @@
+/* memchr (str, chr, len) -- Return pointer to first occurrence of CHR in STR
+   less than LEN.  For Intel 80x86, x>=3.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+   This version is developed using the same algorithm as the fast C
+   version which carries the following introduction:
+   Based on strlen implementation by Torbjorn Granlund (tege@sics.se),
+   with help from Dan Sahlin (dan@sics.se) and
+   commentary by Jim Blandy (jimb@ai.mit.edu);
+   adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu),
+   and implemented by Roland McGrath (roland@ai.mit.edu).
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+8	/* space for 2 saved regs */
+#define RTN	PARMS
+#define STR	RTN
+#define CHR	STR+4
+#define LEN	CHR+4
+
+	.text
+ENTRY (__memchr)
+
+	/* Save callee-safe registers used in this function. */
+	pushl %esi
+	cfi_adjust_cfa_offset (4)
+	pushl %edi
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+
+	/* Load parameters into registers. */
+	movl STR(%esp), %eax	/* str: pointer to memory block. */
+	movl CHR(%esp), %edx	/* c: byte we are looking for. */
+	movl LEN(%esp), %esi	/* len: length of memory block. */
+	cfi_rel_offset (esi, 4)
+
+	/* If we must not test more than three characters, test
+	   them one by one.  This is especially true for 0. */
+	cmpl $4, %esi
+	jb L(3)
+
+	/* At the moment %edx contains CHR.  What we need for the
+	   algorithm is CHR in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require a
+	   prefix byte (and one more cycle). */
+	movb %dl, %dh		/* Now it is 0|0|c|c */
+	movl %edx, %ecx
+	shll $16, %edx		/* Now c|c|0|0 */
+	movw %cx, %dx		/* And finally c|c|c|c */
+
+	/* Better performance can be achieved if the word (32
+	   bit) memory access is aligned on a four-byte boundary.
+	   So process the first bytes one by one until the boundary is
+	   reached.  Don't use a loop for better performance. */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(2)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	je L(4)			/* len==0 => return NULL */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(2)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	je L(4)			/* len==0 => return NULL */
+
+	testb $3, %al		/* correctly aligned ? */
+	je L(2)			/* yes => begin loop */
+	cmpb %dl, (%eax)	/* compare byte */
+	je L(9)			/* target found => return */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length counter */
+	/* no test for len==0 here, because this is done in the
+	   loop head */
+	jmp L(2)
+
+	/* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	   change any of the hole bits of LONGWORD.
+
+	   1) Is this safe?  Will it catch all the zero bytes?
+	   Suppose there is a byte with all zeros.  Any carry bits
+	   propagating from its left will fall into the hole at its
+	   least significant bit and stop.  Since there will be no
+	   carry from its most significant bit, the LSB of the
+	   byte to the left will be unchanged, and the zero will be
+	   detected.
+
+	   2) Is this worthwhile?  Will it ignore everything except
+	   zero bytes?  Suppose every byte of LONGWORD has a bit set
+	   somewhere.  There will be a carry into bit 8.  If bit 8
+	   is set, this will carry into bit 16.  If bit 8 is clear,
+	   one of bits 9-15 must be set, so there will be a carry
+	   into bit 16.  Similarly, there will be a carry into bit
+	   24.  If one of bits 24-31 is set, there will be a carry
+	   into bit 32 (=carry flag), so all of the hole bits will
+	   be changed.
+
+	   3) But wait!  Aren't we looking for CHR, not zero?
+	   Good point.  So what we do is XOR LONGWORD with a longword,
+	   each of whose bytes is CHR.  This turns each byte that is CHR
+	   into a zero.  */
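+
+	/* The same dword test, written out in C for reference.  This is
+	   an illustrative sketch only (the helper name is invented); it
+	   returns nonzero iff one of the four bytes of WORD equals C:
+
+	     static int
+	     dword_has_byte (uint32_t word, unsigned char c)
+	     {
+	       uint32_t charmask = c | ((uint32_t) c << 8);
+	       charmask |= charmask << 16;         // c|c|c|c, as built above
+	       uint32_t xored = word ^ charmask;   // bytes equal to C become 0
+	       uint32_t sum = xored + 0xfefefeff;  // the magic add
+	       if (sum >= 0xfefefeff)              // no carry out: the jnc path
+	         return 1;
+	       uint32_t carries = sum ^ xored;     // bits changed by the add
+	       return ((carries | 0xfefefeff) + 1) != 0;
+	     }
+
+	   The final line mirrors the xorl/orl/incl sequence below: the sum
+	   is zero only if every hole bit received a carry, i.e. only if no
+	   byte of XORED was zero.  */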
+
+
+	/* Each round the main loop processes 16 bytes. */
+
+	ALIGN (4)
+
+L(1):	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow can have occurred in the last byte => it was 0. */
+	jnc L(8)
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove the original bits. */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits. */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0. */
+
+	/* If at least one byte of the word is CHR we don't get 0 in %edi. */
+	jnz L(8)		/* found it => return pointer */
+
+	/* This process is unrolled four times for better performance.
+	   We don't increment the source pointer each time.  Instead we
+	   use offsets and increment by 16 in each run of the loop.  But
+	   before probing for the matching byte we need some extra code
+	   (the pointer adjustments at L(5) to L(7) below).  Even the len
+	   can be compared with constants instead of decrementing each
+	   time. */
+
+	movl 4(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(7)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0. */
+	jnz L(7)		/* found it => return pointer */
+
+	movl 8(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(6)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0. */
+	jnz L(6)		/* found it => return pointer */
+
+	movl 12(%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(5)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0. */
+	jnz L(5)		/* found it => return pointer */
+
+	/* Adjust both counters for a full round, i.e. 16 bytes. */
+	addl $16, %eax
+L(2):	subl $16, %esi
+	jae L(1)		/* Still more than 16 bytes remaining */
+
+	/* Process remaining bytes separately. */
+	cmpl $4-16, %esi	/* rest < 4 bytes? */
+	jb L(3)			/* yes, then test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(8)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0. */
+	jne L(8)		/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	cmpl $8-16, %esi	/* rest < 8 bytes? */
+	jb L(3)			/* yes, then test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(8)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0. */
+	jne L(8)		/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	cmpl $12-16, %esi	/* rest < 12 bytes? */
+	jb L(3)			/* yes, then test byte by byte */
+
+	movl (%eax), %ecx	/* get word (= 4 bytes) in question */
+	movl $0xfefefeff, %edi	/* magic value */
+	xorl %edx, %ecx		/* XOR with word c|c|c|c => bytes of str == c
+				   are now 0 */
+	addl %ecx, %edi		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc L(8)		/* highest byte is CHR => return pointer */
+	xorl %ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl $0xfefefeff, %edi	/* set all non-carry bits */
+	incl %edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0. */
+	jne L(8)		/* found it => return pointer */
+	addl $4, %eax		/* adjust source pointer */
+
+	/* Check the remaining bytes one by one. */
+L(3):	andl $3, %esi		/* mask out uninteresting bytes */
+	jz L(4)			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with CHR */
+	je L(9)			/* equal, then return pointer */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length */
+	jz L(4)			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with CHR */
+	je L(9)			/* equal, then return pointer */
+	incl %eax		/* increment source pointer */
+	decl %esi		/* decrement length */
+	jz L(4)			/* no remaining bytes => return NULL */
+
+	cmpb %dl, (%eax)	/* compare byte with CHR */
+	je L(9)			/* equal, then return pointer */
+
+L(4):	/* no byte found => return NULL */
+	xorl %eax, %eax
+	jmp L(9)
+
+	/* add missing source pointer increments */
+L(5):	addl $4, %eax
+L(6):	addl $4, %eax
+L(7):	addl $4, %eax
+
+	/* Test for the matching byte in the word.  %ecx contains a NUL
+	   char in the byte which originally was the byte we are looking
+	   at. */
+L(8):	testb %cl, %cl		/* test first byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testb %ch, %ch		/* test second byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	testl $0xff0000, %ecx	/* test third byte in dword */
+	jz L(9)			/* if zero => return pointer */
+	incl %eax		/* increment source pointer */
+
+	/* No further test needed since we know it is one of the four bytes. */
+L(9):	popl %edi		/* pop saved registers */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+	popl %esi
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+
+	ret
+END (__memchr)
+
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
diff --git a/REORG.TODO/sysdeps/i386/memcmp.S b/REORG.TODO/sysdeps/i386/memcmp.S
new file mode 100644
index 0000000000..01f8f8ef03
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/memcmp.S
@@ -0,0 +1,73 @@
+/* Compare two memory blocks for differences in the first COUNT bytes.
+   Copyright (C) 1995-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
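[Editor's note: in the memcmp body below, the -1/0/1 result is produced without a branch. After repe/cmpsb stops on the first differing byte, the carry flag says whether the byte of block #2 was bigger, and the sbbl/orb pair turns that into -1 or 1. The same computation in C -- an illustrative sketch, the helper name is invented:

    /* -1 if B1 < B2, else 1; mirrors "sbbl %eax, %eax" + "orb $1, %al". */
    static int
    cmp_sign (unsigned char b1, unsigned char b2)
    {
      int r = -(b1 < b2);   /* sbbl: 0, or -1 when the carry was set */
      return r | 1;         /* orb $1: 0 becomes 1, -1 is unchanged */
    }
]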
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4+4 /* space for 1 saved reg */ +#define BLK1 PARMS +#define BLK2 BLK1+4 +#define LEN BLK2+4 + + .text +ENTRY (memcmp) + + pushl %esi /* Save callee-safe registers. */ + cfi_adjust_cfa_offset (4) + movl %edi, %edx /* Note that %edx is not used and can + so be used to save %edi. It's faster. */ + cfi_register (edi, edx) + + movl BLK1(%esp), %esi + cfi_rel_offset (esi, 0) + movl BLK2(%esp), %edi + movl LEN(%esp), %ecx + + cld /* Set direction of comparison. */ + + xorl %eax, %eax /* Default result. */ + + repe /* Compare at most %ecx bytes. */ + cmpsb + jz L(1) /* If even last byte was equal we return 0. */ + + /* The memory blocks are not equal. So result of the last + subtraction is present in the carry flag. It is set when + the byte in block #2 is bigger. In this case we have to + return -1 (=0xffffffff), else 1. */ + sbbl %eax, %eax /* This is tricky. %eax == 0 and carry is set + or not depending on last subtraction. */ + + /* At this point %eax == 0, if the byte of block #1 was bigger, and + 0xffffffff if the last byte of block #2 was bigger. The latter + case is already correct but the former needs a little adjustment. + Note that the following operation does not change 0xffffffff. */ + orb $1, %al /* Change 0 to 1. */ + +L(1): popl %esi /* Restore registers. */ + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + movl %edx, %edi + cfi_restore (edi) + + ret +END (memcmp) + +#undef bcmp +weak_alias (memcmp, bcmp) +libc_hidden_builtin_def (memcmp) diff --git a/REORG.TODO/sysdeps/i386/memcopy.h b/REORG.TODO/sysdeps/i386/memcopy.h new file mode 100644 index 0000000000..dc6173ee29 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/memcopy.h @@ -0,0 +1,92 @@ +/* memcopy.h -- definitions for memory copy functions. i386 version. + Copyright (C) 1991-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Torbjorn Granlund (tege@sics.se). + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdeps/generic/memcopy.h> + +#undef OP_T_THRES +#define OP_T_THRES 8 + +#undef BYTE_COPY_FWD +#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes) \ + do { \ + int __d0; \ + asm volatile(/* Clear the direction flag, so copying goes forward. */ \ + "cld\n" \ + /* Copy bytes. */ \ + "rep\n" \ + "movsb" : \ + "=D" (dst_bp), "=S" (src_bp), "=c" (__d0) : \ + "0" (dst_bp), "1" (src_bp), "2" (nbytes) : \ + "memory"); \ + } while (0) + +#undef BYTE_COPY_BWD +#define BYTE_COPY_BWD(dst_ep, src_ep, nbytes) \ + do \ + { \ + int __d0; \ + asm volatile(/* Set the direction flag, so copying goes backwards. */ \ + "std\n" \ + /* Copy bytes. */ \ + "rep\n" \ + "movsb\n" \ + /* Clear the dir flag. Convention says it should be 0. 
*/ \ + "cld" : \ + "=D" (dst_ep), "=S" (src_ep), "=c" (__d0) : \ + "0" (dst_ep - 1), "1" (src_ep - 1), "2" (nbytes) : \ + "memory"); \ + dst_ep += 1; \ + src_ep += 1; \ + } while (0) + +#undef WORD_COPY_FWD +#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \ + do \ + { \ + int __d0; \ + asm volatile(/* Clear the direction flag, so copying goes forward. */ \ + "cld\n" \ + /* Copy longwords. */ \ + "rep\n" \ + "movsl" : \ + "=D" (dst_bp), "=S" (src_bp), "=c" (__d0) : \ + "0" (dst_bp), "1" (src_bp), "2" ((nbytes) / 4) : \ + "memory"); \ + (nbytes_left) = (nbytes) % 4; \ + } while (0) + +#undef WORD_COPY_BWD +#define WORD_COPY_BWD(dst_ep, src_ep, nbytes_left, nbytes) \ + do \ + { \ + int __d0; \ + asm volatile(/* Set the direction flag, so copying goes backwards. */ \ + "std\n" \ + /* Copy longwords. */ \ + "rep\n" \ + "movsl\n" \ + /* Clear the dir flag. Convention says it should be 0. */ \ + "cld" : \ + "=D" (dst_ep), "=S" (src_ep), "=c" (__d0) : \ + "0" (dst_ep - 4), "1" (src_ep - 4), "2" ((nbytes) / 4) : \ + "memory"); \ + dst_ep += 4; \ + src_ep += 4; \ + (nbytes_left) = (nbytes) % 4; \ + } while (0) diff --git a/REORG.TODO/sysdeps/i386/memcpy.S b/REORG.TODO/sysdeps/i386/memcpy.S new file mode 100644 index 0000000000..06568ea724 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/memcpy.S @@ -0,0 +1,95 @@ +/* memcpy with REP MOVSB/STOSB + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include "asm-syntax.h" + +#ifndef MEMCPY +# define MEMCPY memcpy +# define MEMCPY_CHK __memcpy_chk +#endif + +#ifdef USE_AS_BCOPY +# define STR2 12 +# define STR1 STR2+4 +# define N STR1+4 +#else +# define STR1 12 +# define STR2 STR1+4 +# define N STR2+4 +#endif + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + + .text +#if defined SHARED && IS_IN (libc) && !defined USE_AS_BCOPY +ENTRY (MEMCPY_CHK) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif +ENTRY (MEMCPY) + PUSH (%esi) + PUSH (%edi) + movl N(%esp), %ecx + movl STR1(%esp), %edi + movl STR2(%esp), %esi + mov %edi, %eax +#ifdef USE_AS_MEMPCPY + add %ecx, %eax +#endif + +#ifdef USE_AS_MEMMOVE + cmp %esi, %edi + ja L(copy_backward) + je L(bwd_write_0bytes) +#endif + + rep movsb + POP (%edi) + POP (%esi) + ret + +#ifdef USE_AS_MEMMOVE +L(copy_backward): + lea -1(%edi,%ecx), %edi + lea -1(%esi,%ecx), %esi + std + rep movsb + cld +L(bwd_write_0bytes): + POP (%edi) + POP (%esi) + ret +#endif + +END (MEMCPY) + +#ifndef USE_AS_BCOPY +libc_hidden_builtin_def (MEMCPY) +#endif diff --git a/REORG.TODO/sysdeps/i386/memcpy_chk.S b/REORG.TODO/sysdeps/i386/memcpy_chk.S new file mode 100644 index 0000000000..0f6f585c41 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/memcpy_chk.S @@ -0,0 +1,34 @@ +/* Checking memcpy for i386. + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef SHARED +# include <sysdep.h> +# include "asm-syntax.h" + + /* For libc.so this is defined in memcpy.S. + For libc.a, this is a separate source to avoid + memcpy bringing in __chk_fail and all routines + it calls. */ + .text +ENTRY (__memcpy_chk) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp memcpy +END (__memcpy_chk) +#endif diff --git a/REORG.TODO/sysdeps/i386/memmove.S b/REORG.TODO/sysdeps/i386/memmove.S new file mode 100644 index 0000000000..60a45d21e0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/memmove.S @@ -0,0 +1,4 @@ +#define USE_AS_MEMMOVE +#define MEMCPY memmove +#define MEMCPY_CHK __memmove_chk +#include "memcpy.S" diff --git a/REORG.TODO/sysdeps/i386/memmove_chk.S b/REORG.TODO/sysdeps/i386/memmove_chk.S new file mode 100644 index 0000000000..0c7037cc05 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/memmove_chk.S @@ -0,0 +1,33 @@ +/* Checking memmove for i386 + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
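[Editor's note: the USE_AS_MEMMOVE path in memcpy.S above boils down to a single overlap test: copy forward unless the destination lies above the source, in which case copy from the top ends downward. A plain C rendering of that dispatch, illustrative only (the real code uses rep movsb with the direction flag instead of byte loops):

    #include <stddef.h>

    static void *
    memmove_sketch (void *dst, const void *src, size_t n)
    {
      unsigned char *d = dst;
      const unsigned char *s = src;
      if (d == s)
        return dst;              /* je L(bwd_write_0bytes): nothing to do */
      if (d < s)
        while (n--)              /* cld; rep movsb (forward) */
          *d++ = *s++;
      else
        while (n--)              /* std; rep movsb from lea -1(...,%ecx) */
          d[n] = s[n];
      return dst;
    }
]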
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef SHARED +# include <sysdep.h> +# include "asm-syntax.h" + +/* For libc.so this is defined in memmove.S. For libc.a, this is a + separate source to avoid memmove bringing in __chk_fail and all + routines it calls. */ + .text +ENTRY (__memmove_chk) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp memmove +END (__memmove_chk) +#endif diff --git a/REORG.TODO/sysdeps/i386/mempcpy.S b/REORG.TODO/sysdeps/i386/mempcpy.S new file mode 100644 index 0000000000..61addb75f4 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/mempcpy.S @@ -0,0 +1,7 @@ +#define USE_AS_MEMPCPY +#define MEMCPY __mempcpy +#define MEMCPY_CHK __mempcpy_chk +#include "memcpy.S" + +weak_alias (__mempcpy, mempcpy) +libc_hidden_builtin_def (mempcpy) diff --git a/REORG.TODO/sysdeps/i386/mempcpy_chk.S b/REORG.TODO/sysdeps/i386/mempcpy_chk.S new file mode 100644 index 0000000000..4d8ac5c25b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/mempcpy_chk.S @@ -0,0 +1,33 @@ +/* Checking mempcpy for i386 + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef SHARED +# include <sysdep.h> +# include "asm-syntax.h" + +/* For libc.so this is defined in mempcpy.S. For libc.a, this is a + separate source to avoid mempcpy bringing in __chk_fail and all + routines it calls. */ + .text +ENTRY (__mempcpy_chk) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp mempcpy +END (__mempcpy_chk) +#endif diff --git a/REORG.TODO/sysdeps/i386/memset.S b/REORG.TODO/sysdeps/i386/memset.S new file mode 100644 index 0000000000..46ae65d2e4 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/memset.S @@ -0,0 +1,68 @@ +/* memset with REP MOVSB/STOSB + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
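[Editor's note: every *_chk entry point in these files follows the same three-instruction pattern: compare the destination object size recorded by _FORTIFY_SOURCE against the requested length, trap if the buffer is too small, then tail-call the real routine. Sketched in C -- illustrative; glibc jumps to __chk_fail rather than calling abort, and the name below is invented:

    #include <string.h>
    #include <stdlib.h>

    static void *
    my_memcpy_chk (void *dst, const void *src, size_t n, size_t dstlen)
    {
      if (dstlen < n)                /* cmpl %eax, 16(%esp); jb __chk_fail */
        abort ();
      return memcpy (dst, src, n);   /* jmp memcpy */
    }
]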
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#define STR1 8 +#ifdef USE_AS_BZERO +#define N STR1+4 +#else +#define STR2 STR1+4 +#define N STR2+4 +#endif + + .text +#if defined SHARED && IS_IN (libc) && !defined USE_AS_BZERO +ENTRY (__memset_chk) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb HIDDEN_JUMPTARGET (__chk_fail) +END (__memset_chk) +#endif +ENTRY (memset) + PUSH (%edi) + movl N(%esp), %ecx + movl STR1(%esp), %edi +#ifdef USE_AS_BZERO + xor %eax, %eax +#else + movzbl STR2(%esp), %eax + mov %edi, %edx +#endif + rep stosb +#ifndef USE_AS_BZERO + mov %edx, %eax +#endif + POP (%edi) + ret +END (memset) + +#ifndef USE_AS_BZERO +libc_hidden_builtin_def (memset) +#endif diff --git a/REORG.TODO/sysdeps/i386/memset_chk.S b/REORG.TODO/sysdeps/i386/memset_chk.S new file mode 100644 index 0000000000..da7837111e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/memset_chk.S @@ -0,0 +1,33 @@ +/* Checking memset for i386. + Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef SHARED +# include <sysdep.h> +# include "asm-syntax.h" + +/* For libc.so this is defined in memset.S. For libc.a, this is a + separate source to avoid memset bringing in __chk_fail and all + routines it calls. */ + .text +ENTRY (__memset_chk) + movl 12(%esp), %eax + cmpl %eax, 16(%esp) + jb __chk_fail + jmp memset +END (__memset_chk) +#endif diff --git a/REORG.TODO/sysdeps/i386/memusage.h b/REORG.TODO/sysdeps/i386/memusage.h new file mode 100644 index 0000000000..30167be833 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/memusage.h @@ -0,0 +1,20 @@ +/* Copyright (C) 2000-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
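[Editor's note: the core of memset.S above is a single rep stosb. As a GCC inline-asm sketch -- i386-specific and illustrative only; glibc's version additionally provides the __memset_chk entry and the bzero variant:

    #include <stddef.h>

    static void *
    memset_rep_stosb (void *dst, int c, size_t n)
    {
      void *d = dst;                           /* %edi is advanced, so copy it */
      asm volatile ("rep stosb"
                    : "+D" (d), "+c" (n)       /* edi = dest, ecx = count */
                    : "a" ((unsigned char) c)  /* al = fill byte */
                    : "memory");
      return dst;                              /* like keeping %edi in %edx */
    }
]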
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#define GETSP() ({ register uintptr_t stack_ptr asm ("esp"); stack_ptr; }) + +#include <sysdeps/generic/memusage.h> diff --git a/REORG.TODO/sysdeps/i386/mp_clz_tab.c b/REORG.TODO/sysdeps/i386/mp_clz_tab.c new file mode 100644 index 0000000000..860f98cc62 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/mp_clz_tab.c @@ -0,0 +1 @@ +/* __clz_tab not needed on i386. */ diff --git a/REORG.TODO/sysdeps/i386/mul_1.S b/REORG.TODO/sysdeps/i386/mul_1.S new file mode 100644 index 0000000000..cf83d1b343 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/mul_1.S @@ -0,0 +1,86 @@ +/* i80386 __mpn_mul_1 -- Multiply a limb vector with a limb and store + the result in a second limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define SIZE S1+4 +#define S2LIMB SIZE+4 + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebx + + .text +ENTRY (__mpn_mul_1) + + pushl %res_ptr + cfi_adjust_cfa_offset (4) + pushl %s1_ptr + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %s2_limb + cfi_adjust_cfa_offset (4) + + movl RES(%esp), %res_ptr + cfi_rel_offset (res_ptr, 12) + movl S1(%esp), %s1_ptr + cfi_rel_offset (s1_ptr, 8) + movl SIZE(%esp), %size + movl S2LIMB(%esp), %s2_limb + cfi_rel_offset (s2_limb, 0) + leal (%res_ptr,%size,4), %res_ptr + leal (%s1_ptr,%size,4), %s1_ptr + negl %size + xorl %ebp, %ebp + cfi_rel_offset (ebp, 4) + ALIGN (3) +L(oop): + movl (%s1_ptr,%size,4), %eax + mull %s2_limb + addl %ebp, %eax + movl %eax, (%res_ptr,%size,4) + adcl $0, %edx + movl %edx, %ebp + + incl %size + jnz L(oop) + movl %ebp, %eax + + popl %s2_limb + cfi_adjust_cfa_offset (-4) + cfi_restore (s2_limb) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %s1_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (s1_ptr) + popl %res_ptr + cfi_adjust_cfa_offset (-4) + cfi_restore (res_ptr) + + ret +#undef size +END (__mpn_mul_1) diff --git a/REORG.TODO/sysdeps/i386/nptl/Makefile b/REORG.TODO/sysdeps/i386/nptl/Makefile new file mode 100644 index 0000000000..2c61b352eb --- /dev/null +++ b/REORG.TODO/sysdeps/i386/nptl/Makefile @@ -0,0 +1,26 @@ +# Copyright (C) 2002-2017 Free Software Foundation, Inc. +# This file is part of the GNU C Library. + +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
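[Editor's note: __mpn_mul_1 above is the schoolbook inner loop: multiply each 32-bit limb by s2_limb, add the running carry, store the low half, and keep the high half as the next carry (mull yields edx:eax; addl/adcl propagate the carry). A C reference under the assumption of 32-bit limbs, illustrative only:

    #include <stddef.h>
    #include <stdint.h>

    typedef uint32_t limb_t;

    /* Returns the limb carried out of the top; rp and s1 hold n limbs.
       The 64-bit product cannot overflow: (2^32-1)^2 + (2^32-1) < 2^64.  */
    static limb_t
    mul_1_ref (limb_t *rp, const limb_t *s1, size_t n, limb_t s2)
    {
      limb_t carry = 0;
      for (size_t i = 0; i < n; i++)
        {
          uint64_t p = (uint64_t) s1[i] * s2 + carry;   /* mull + addl */
          rp[i] = (limb_t) p;                           /* movl to res_ptr */
          carry = (limb_t) (p >> 32);                   /* adcl $0, %edx */
        }
      return carry;
    }
]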
+ +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. + +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +ifeq ($(subdir),csu) +gen-as-const-headers += tcb-offsets.sym +endif + +ifeq ($(subdir),nptl) +CFLAGS-pthread_create.c += -mpreferred-stack-boundary=4 +CFLAGS-tst-align.c += -mpreferred-stack-boundary=4 +CFLAGS-tst-align2.c += -mpreferred-stack-boundary=4 +endif diff --git a/REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c new file mode 100644 index 0000000000..a1205b9698 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_init.c @@ -0,0 +1,19 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Not needed. pthread_spin_init is an alias for pthread_spin_unlock. */ diff --git a/REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S new file mode 100644 index 0000000000..160244b7a8 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_lock.S @@ -0,0 +1,37 @@ +/* Copyright (C) 2012-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <lowlevellock.h> + + .globl pthread_spin_lock + .type pthread_spin_lock,@function + .align 16 +pthread_spin_lock: + mov 4(%esp), %eax +1: LOCK + decl 0(%eax) + jne 2f + xor %eax, %eax + ret + + .align 16 +2: rep + nop + cmpl $0, 0(%eax) + jg 1b + jmp 2b + .size pthread_spin_lock,.-pthread_spin_lock diff --git a/REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S new file mode 100644 index 0000000000..b6636ae8d7 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/nptl/pthread_spin_unlock.S @@ -0,0 +1,31 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + + .globl pthread_spin_unlock + .type pthread_spin_unlock,@function + .align 16 +pthread_spin_unlock: + movl 4(%esp), %eax + movl $1, (%eax) + xorl %eax, %eax + ret + .size pthread_spin_unlock,.-pthread_spin_unlock + + /* The implementation of pthread_spin_init is identical. */ + .globl pthread_spin_init +pthread_spin_init = pthread_spin_unlock diff --git a/REORG.TODO/sysdeps/i386/nptl/pthreaddef.h b/REORG.TODO/sysdeps/i386/nptl/pthreaddef.h new file mode 100644 index 0000000000..54abccd11b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/nptl/pthreaddef.h @@ -0,0 +1,40 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Default stack size. */ +#define ARCH_STACK_DEFAULT_SIZE (2 * 1024 * 1024) + +/* Required stack pointer alignment at beginning. SSE requires 16 + bytes. */ +#define STACK_ALIGN 16 + +/* Minimal stack size after allocating thread descriptor and guard size. */ +#define MINIMAL_REST_STACK 2048 + +/* Alignment requirement for TCB. + + Some processors such as Intel Atom pay a big penalty on every + access using a segment override if that segment's base is not + aligned to the size of a cache line. (See Intel 64 and IA-32 + Architectures Optimization Reference Manual, section 13.3.3.3, + "Segment Base".) On such machines, a cache line is 64 bytes. */ +#define TCB_ALIGNMENT 64 + + +/* Location of current stack frame. 
*/ +#define CURRENT_STACK_FRAME __builtin_frame_address (0) diff --git a/REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym b/REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym new file mode 100644 index 0000000000..695a810386 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/nptl/tcb-offsets.sym @@ -0,0 +1,17 @@ +#include <sysdep.h> +#include <tls.h> +#include <kernel-features.h> + +RESULT offsetof (struct pthread, result) +TID offsetof (struct pthread, tid) +CANCELHANDLING offsetof (struct pthread, cancelhandling) +CLEANUP_JMP_BUF offsetof (struct pthread, cleanup_jmp_buf) +MULTIPLE_THREADS_OFFSET offsetof (tcbhead_t, multiple_threads) +SYSINFO_OFFSET offsetof (tcbhead_t, sysinfo) +CLEANUP offsetof (struct pthread, cleanup) +CLEANUP_PREV offsetof (struct _pthread_cleanup_buffer, __prev) +MUTEX_FUTEX offsetof (pthread_mutex_t, __data.__lock) +POINTER_GUARD offsetof (tcbhead_t, pointer_guard) +#ifndef __ASSUME_PRIVATE_FUTEX +PRIVATE_FUTEX offsetof (tcbhead_t, private_futex) +#endif diff --git a/REORG.TODO/sysdeps/i386/nptl/tls.h b/REORG.TODO/sysdeps/i386/nptl/tls.h new file mode 100644 index 0000000000..f9a6b11ecf --- /dev/null +++ b/REORG.TODO/sysdeps/i386/nptl/tls.h @@ -0,0 +1,435 @@ +/* Definition for thread-local data handling. nptl/i386 version. + Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _TLS_H +#define _TLS_H 1 + +#include <dl-sysdep.h> +#ifndef __ASSEMBLER__ +# include <stdbool.h> +# include <stddef.h> +# include <stdint.h> +# include <stdlib.h> +# include <sysdep.h> +# include <libc-pointer-arith.h> /* For cast_to_integer. */ +# include <kernel-features.h> +# include <dl-dtv.h> + +typedef struct +{ + void *tcb; /* Pointer to the TCB. Not necessarily the + thread descriptor used by libpthread. */ + dtv_t *dtv; + void *self; /* Pointer to the thread descriptor. */ + int multiple_threads; + uintptr_t sysinfo; + uintptr_t stack_guard; + uintptr_t pointer_guard; + int gscope_flag; +#ifndef __ASSUME_PRIVATE_FUTEX + int private_futex; +#else + int __glibc_reserved1; +#endif + /* Reservation of some values for the TM ABI. */ + void *__private_tm[4]; + /* GCC split stack support. */ + void *__private_ss; +} tcbhead_t; + +# define TLS_MULTIPLE_THREADS_IN_TCB 1 + +#else /* __ASSEMBLER__ */ +# include <tcb-offsets.h> +#endif + + +/* Alignment requirement for the stack. For IA-32 this is governed by + the SSE memory functions. */ +#define STACK_ALIGN 16 + +#ifndef __ASSEMBLER__ +/* Get system call information. */ +# include <sysdep.h> + +/* The old way: using LDT. */ + +/* Structure passed to `modify_ldt', 'set_thread_area', and 'clone' calls. 
*/ +struct user_desc +{ + unsigned int entry_number; + unsigned long int base_addr; + unsigned int limit; + unsigned int seg_32bit:1; + unsigned int contents:2; + unsigned int read_exec_only:1; + unsigned int limit_in_pages:1; + unsigned int seg_not_present:1; + unsigned int useable:1; + unsigned int empty:25; +}; + +/* Initializing bit fields is slow. We speed it up by using a union. */ +union user_desc_init +{ + struct user_desc desc; + unsigned int vals[4]; +}; + + +/* This is the size of the initial TCB. Can't be just sizeof (tcbhead_t), + because NPTL getpid, __libc_alloca_cutoff etc. need (almost) the whole + struct pthread even when not linked with -lpthread. */ +# define TLS_INIT_TCB_SIZE sizeof (struct pthread) + +/* Alignment requirements for the initial TCB. */ +# define TLS_INIT_TCB_ALIGN __alignof__ (struct pthread) + +/* This is the size of the TCB. */ +# define TLS_TCB_SIZE sizeof (struct pthread) + +/* Alignment requirements for the TCB. */ +# define TLS_TCB_ALIGN __alignof__ (struct pthread) + +/* The TCB can have any size and the memory following the address the + thread pointer points to is unspecified. Allocate the TCB there. */ +# define TLS_TCB_AT_TP 1 +# define TLS_DTV_AT_TP 0 + +/* Get the thread descriptor definition. */ +# include <nptl/descr.h> + + +/* Install the dtv pointer. The pointer passed is to the element with + index -1 which contain the length. */ +# define INSTALL_DTV(descr, dtvp) \ + ((tcbhead_t *) (descr))->dtv = (dtvp) + 1 + +/* Install new dtv for current thread. */ +# define INSTALL_NEW_DTV(dtvp) \ + ({ struct pthread *__pd; \ + THREAD_SETMEM (__pd, header.dtv, (dtvp)); }) + +/* Return dtv of given thread descriptor. */ +# define GET_DTV(descr) \ + (((tcbhead_t *) (descr))->dtv) + +/* Macros to load from and store into segment registers. */ +# ifndef TLS_GET_GS +# define TLS_GET_GS() \ + ({ int __seg; __asm ("movw %%gs, %w0" : "=q" (__seg)); __seg & 0xffff; }) +# endif +# ifndef TLS_SET_GS +# define TLS_SET_GS(val) \ + __asm ("movw %w0, %%gs" :: "q" (val)) +# endif + +#ifdef NEED_DL_SYSINFO +# define INIT_SYSINFO \ + _head->sysinfo = GLRO(dl_sysinfo) +# define SETUP_THREAD_SYSINFO(pd) \ + ((pd)->header.sysinfo = THREAD_GETMEM (THREAD_SELF, header.sysinfo)) +# define CHECK_THREAD_SYSINFO(pd) \ + assert ((pd)->header.sysinfo == THREAD_GETMEM (THREAD_SELF, header.sysinfo)) +#else +# define INIT_SYSINFO +#endif + +#ifndef LOCK_PREFIX +# ifdef UP +# define LOCK_PREFIX /* nothing */ +# else +# define LOCK_PREFIX "lock;" +# endif +#endif + +static inline void __attribute__ ((unused, always_inline)) +tls_fill_user_desc (union user_desc_init *desc, + unsigned int entry_number, + void *pd) +{ + desc->vals[0] = entry_number; + /* The 'base_addr' field. Pointer to the TCB. */ + desc->vals[1] = (unsigned long int) pd; + /* The 'limit' field. We use 4GB which is 0xfffff pages. */ + desc->vals[2] = 0xfffff; + /* Collapsed value of the bitfield: + .seg_32bit = 1 + .contents = 0 + .read_exec_only = 0 + .limit_in_pages = 1 + .seg_not_present = 0 + .useable = 1 */ + desc->vals[3] = 0x51; +} + +/* Code to initially initialize the thread pointer. This might need + special attention since 'errno' is not yet available and if the + operation can cause a failure 'errno' must not be touched. */ +# define TLS_INIT_TP(thrdescr) \ + ({ void *_thrdescr = (thrdescr); \ + tcbhead_t *_head = _thrdescr; \ + union user_desc_init _segdescr; \ + int _result; \ + \ + _head->tcb = _thrdescr; \ + /* For now the thread descriptor is at the same address. 
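The collapsed bitfield constant 0x51 in tls_fill_user_desc above can be re-derived from the named fields; a small standalone check (sketch only; bit positions follow the user_desc layout above):

    #include <stdio.h>

    /* seg_32bit is bit 0, contents bits 1-2, read_exec_only bit 3,
       limit_in_pages bit 4, seg_not_present bit 5, useable bit 6.  */
    int
    main (void)
    {
      unsigned int collapsed = (1u << 0)   /* seg_32bit = 1 */
                             | (1u << 4)   /* limit_in_pages = 1 */
                             | (1u << 6);  /* useable = 1 */
      printf ("%#x\n", collapsed);         /* prints 0x51 */
      return 0;
    }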
*/ \ + _head->self = _thrdescr; \ + /* New syscall handling support. */ \ + INIT_SYSINFO; \ + \ + /* Let the kernel pick a value for the 'entry_number' field. */ \ + tls_fill_user_desc (&_segdescr, -1, _thrdescr); \ + \ + /* Install the TLS. */ \ + INTERNAL_SYSCALL_DECL (err); \ + _result = INTERNAL_SYSCALL (set_thread_area, err, 1, &_segdescr.desc); \ + \ + if (_result == 0) \ + /* We know the index in the GDT, now load the segment register. \ + The use of the GDT is described by the value 3 in the lower \ + three bits of the segment descriptor value. \ + \ + Note that we have to do this even if the numeric value of \ + the descriptor does not change. Loading the segment register \ + causes the segment information from the GDT to be loaded \ + which is necessary since we have changed it. */ \ + TLS_SET_GS (_segdescr.desc.entry_number * 8 + 3); \ + \ + _result == 0 ? NULL \ + : "set_thread_area failed when setting up thread-local storage\n"; }) + +# define TLS_DEFINE_INIT_TP(tp, pd) \ + union user_desc_init _segdescr; \ + /* Find the 'entry_number' field that the kernel selected in TLS_INIT_TP. \ + The first three bits of the segment register value select the GDT, \ + ignore them. We get the index from the value of the %gs register in \ + the current thread. */ \ + tls_fill_user_desc (&_segdescr, TLS_GET_GS () >> 3, pd); \ + const struct user_desc *tp = &_segdescr.desc + + +/* Return the address of the dtv for the current thread. */ +# define THREAD_DTV() \ + ({ struct pthread *__pd; \ + THREAD_GETMEM (__pd, header.dtv); }) + + +/* Return the thread descriptor for the current thread. + + The contained asm must *not* be marked volatile since otherwise + assignments like + pthread_descr self = thread_self(); + do not get optimized away. */ +# define THREAD_SELF \ + ({ struct pthread *__self; \ + asm ("movl %%gs:%c1,%0" : "=r" (__self) \ + : "i" (offsetof (struct pthread, header.self))); \ + __self;}) + +/* Magic for libthread_db to know how to do THREAD_SELF. */ +# define DB_THREAD_SELF \ + REGISTER_THREAD_AREA (32, offsetof (struct user_regs_struct, xgs), 3) \ + REGISTER_THREAD_AREA (64, 26 * 8, 3) /* x86-64's user_regs_struct->gs */ + + +/* Read member of the thread descriptor directly. */ +# define THREAD_GETMEM(descr, member) \ + ({ __typeof (descr->member) __value; \ + if (sizeof (__value) == 1) \ + asm volatile ("movb %%gs:%P2,%b0" \ + : "=q" (__value) \ + : "0" (0), "i" (offsetof (struct pthread, member))); \ + else if (sizeof (__value) == 4) \ + asm volatile ("movl %%gs:%P1,%0" \ + : "=r" (__value) \ + : "i" (offsetof (struct pthread, member))); \ + else \ + { \ + if (sizeof (__value) != 8) \ + /* There should not be any value with a size other than 1, \ + 4 or 8. */ \ + abort (); \ + \ + asm volatile ("movl %%gs:%P1,%%eax\n\t" \ + "movl %%gs:%P2,%%edx" \ + : "=A" (__value) \ + : "i" (offsetof (struct pthread, member)), \ + "i" (offsetof (struct pthread, member) + 4)); \ + } \ + __value; }) + + +/* Same as THREAD_GETMEM, but the member offset can be non-constant. 
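THREAD_SELF and the simple THREAD_GETMEM cases above each compile down to a single %gs-relative load. A minimal standalone illustration (demo code, not the glibc macro; offset 8 is where `self' sits in tcbhead_t on i386, per the struct earlier):

    /* 4-byte %gs-relative load, as generated by THREAD_SELF.  */
    static inline void *
    demo_thread_self (void)
    {
      void *self;
      asm ("movl %%gs:8, %0" : "=r" (self)); /* offsetof (tcbhead_t, self) */
      return self;
    }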
*/ +# define THREAD_GETMEM_NC(descr, member, idx) \ + ({ __typeof (descr->member[0]) __value; \ + if (sizeof (__value) == 1) \ + asm volatile ("movb %%gs:%P2(%3),%b0" \ + : "=q" (__value) \ + : "0" (0), "i" (offsetof (struct pthread, member[0])), \ + "r" (idx)); \ + else if (sizeof (__value) == 4) \ + asm volatile ("movl %%gs:%P1(,%2,4),%0" \ + : "=r" (__value) \ + : "i" (offsetof (struct pthread, member[0])), \ + "r" (idx)); \ + else \ + { \ + if (sizeof (__value) != 8) \ + /* There should not be any value with a size other than 1, \ + 4 or 8. */ \ + abort (); \ + \ + asm volatile ("movl %%gs:%P1(,%2,8),%%eax\n\t" \ + "movl %%gs:4+%P1(,%2,8),%%edx" \ + : "=&A" (__value) \ + : "i" (offsetof (struct pthread, member[0])), \ + "r" (idx)); \ + } \ + __value; }) + + + +/* Set member of the thread descriptor directly. */ +# define THREAD_SETMEM(descr, member, value) \ + ({ if (sizeof (descr->member) == 1) \ + asm volatile ("movb %b0,%%gs:%P1" : \ + : "iq" (value), \ + "i" (offsetof (struct pthread, member))); \ + else if (sizeof (descr->member) == 4) \ + asm volatile ("movl %0,%%gs:%P1" : \ + : "ir" (value), \ + "i" (offsetof (struct pthread, member))); \ + else \ + { \ + if (sizeof (descr->member) != 8) \ + /* There should not be any value with a size other than 1, \ + 4 or 8. */ \ + abort (); \ + \ + asm volatile ("movl %%eax,%%gs:%P1\n\t" \ + "movl %%edx,%%gs:%P2" : \ + : "A" ((uint64_t) cast_to_integer (value)), \ + "i" (offsetof (struct pthread, member)), \ + "i" (offsetof (struct pthread, member) + 4)); \ + }}) + + +/* Same as THREAD_SETMEM, but the member offset can be non-constant. */ +# define THREAD_SETMEM_NC(descr, member, idx, value) \ + ({ if (sizeof (descr->member[0]) == 1) \ + asm volatile ("movb %b0,%%gs:%P1(%2)" : \ + : "iq" (value), \ + "i" (offsetof (struct pthread, member)), \ + "r" (idx)); \ + else if (sizeof (descr->member[0]) == 4) \ + asm volatile ("movl %0,%%gs:%P1(,%2,4)" : \ + : "ir" (value), \ + "i" (offsetof (struct pthread, member)), \ + "r" (idx)); \ + else \ + { \ + if (sizeof (descr->member[0]) != 8) \ + /* There should not be any value with a size other than 1, \ + 4 or 8. */ \ + abort (); \ + \ + asm volatile ("movl %%eax,%%gs:%P1(,%2,8)\n\t" \ + "movl %%edx,%%gs:4+%P1(,%2,8)" : \ + : "A" ((uint64_t) cast_to_integer (value)), \ + "i" (offsetof (struct pthread, member)), \ + "r" (idx)); \ + }}) + + +/* Atomic compare and exchange on TLS, returning old value. */ +#define THREAD_ATOMIC_CMPXCHG_VAL(descr, member, newval, oldval) \ + ({ __typeof (descr->member) __ret; \ + __typeof (oldval) __old = (oldval); \ + if (sizeof (descr->member) == 4) \ + asm volatile (LOCK_PREFIX "cmpxchgl %2, %%gs:%P3" \ + : "=a" (__ret) \ + : "0" (__old), "r" (newval), \ + "i" (offsetof (struct pthread, member))); \ + else \ + /* Not necessary for other sizes in the moment. */ \ + abort (); \ + __ret; }) + + +/* Atomic logical and. */ +#define THREAD_ATOMIC_AND(descr, member, val) \ + (void) ({ if (sizeof ((descr)->member) == 4) \ + asm volatile (LOCK_PREFIX "andl %1, %%gs:%P0" \ + :: "i" (offsetof (struct pthread, member)), \ + "ir" (val)); \ + else \ + /* Not necessary for other sizes in the moment. */ \ + abort (); }) + + +/* Atomic set bit. */ +#define THREAD_ATOMIC_BIT_SET(descr, member, bit) \ + (void) ({ if (sizeof ((descr)->member) == 4) \ + asm volatile (LOCK_PREFIX "orl %1, %%gs:%P0" \ + :: "i" (offsetof (struct pthread, member)), \ + "ir" (1 << (bit))); \ + else \ + /* Not necessary for other sizes in the moment. 
*/ \ + abort (); }) + + +/* Set the stack guard field in TCB head. */ +#define THREAD_SET_STACK_GUARD(value) \ + THREAD_SETMEM (THREAD_SELF, header.stack_guard, value) +#define THREAD_COPY_STACK_GUARD(descr) \ + ((descr)->header.stack_guard \ + = THREAD_GETMEM (THREAD_SELF, header.stack_guard)) + + +/* Set the pointer guard field in the TCB head. */ +#define THREAD_SET_POINTER_GUARD(value) \ + THREAD_SETMEM (THREAD_SELF, header.pointer_guard, value) +#define THREAD_COPY_POINTER_GUARD(descr) \ + ((descr)->header.pointer_guard \ + = THREAD_GETMEM (THREAD_SELF, header.pointer_guard)) + + +/* Get and set the global scope generation counter in the TCB head. */ +#define THREAD_GSCOPE_FLAG_UNUSED 0 +#define THREAD_GSCOPE_FLAG_USED 1 +#define THREAD_GSCOPE_FLAG_WAIT 2 +#define THREAD_GSCOPE_RESET_FLAG() \ + do \ + { int __res; \ + asm volatile ("xchgl %0, %%gs:%P1" \ + : "=r" (__res) \ + : "i" (offsetof (struct pthread, header.gscope_flag)), \ + "0" (THREAD_GSCOPE_FLAG_UNUSED)); \ + if (__res == THREAD_GSCOPE_FLAG_WAIT) \ + lll_futex_wake (&THREAD_SELF->header.gscope_flag, 1, LLL_PRIVATE); \ + } \ + while (0) +#define THREAD_GSCOPE_SET_FLAG() \ + THREAD_SETMEM (THREAD_SELF, header.gscope_flag, THREAD_GSCOPE_FLAG_USED) +#define THREAD_GSCOPE_WAIT() \ + GL(dl_wait_lookup_done) () + +#endif /* __ASSEMBLER__ */ + +#endif /* tls.h */ diff --git a/REORG.TODO/sysdeps/i386/preconfigure b/REORG.TODO/sysdeps/i386/preconfigure new file mode 100644 index 0000000000..c8fefd1bff --- /dev/null +++ b/REORG.TODO/sysdeps/i386/preconfigure @@ -0,0 +1,5 @@ +# preconfigure fragment for i386. + +case "$machine" in +i[4567]86) base_machine=i386 machine=i386/$machine ;; +esac diff --git a/REORG.TODO/sysdeps/i386/pthread_spin_trylock.S b/REORG.TODO/sysdeps/i386/pthread_spin_trylock.S new file mode 100644 index 0000000000..f71a9fcb2d --- /dev/null +++ b/REORG.TODO/sysdeps/i386/pthread_spin_trylock.S @@ -0,0 +1,46 @@ +/* Copyright (C) 2002-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <pthread-errnos.h> + + +#ifdef UP +# define LOCK +#else +# define LOCK lock +#endif + + .globl pthread_spin_trylock + .type pthread_spin_trylock,@function + .align 16 +pthread_spin_trylock: + movl 4(%esp), %edx + movl $1, %eax + xorl %ecx, %ecx + LOCK + cmpxchgl %ecx, (%edx) + movl $EBUSY, %eax +#ifdef HAVE_CMOV + cmovel %ecx, %eax +#else + jne 0f + movl %ecx, %eax +0: +#endif + ret + .size pthread_spin_trylock,.-pthread_spin_trylock diff --git a/REORG.TODO/sysdeps/i386/rawmemchr.S b/REORG.TODO/sysdeps/i386/rawmemchr.S new file mode 100644 index 0000000000..246ec3f18e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/rawmemchr.S @@ -0,0 +1,222 @@ +/* rawmemchr (str, ch) -- Return pointer to first occurrence of CH in STR. + For Intel 80x86, x>=3. 
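The pthread_spin_trylock assembly above maps directly onto a C11 compare-and-swap: %eax holds the expected "free" value 1, cmpxchg tries to store 0, and EBUSY is selected when the exchange fails (with cmov where HAVE_CMOV, a branch otherwise). A hedged sketch with illustrative names:

    #include <errno.h>
    #include <stdatomic.h>

    typedef _Atomic int demo_spinlock_t;   /* 1 = free, 0 = taken */

    int
    demo_spin_trylock (demo_spinlock_t *lock)
    {
      int expected = 1;                    /* %eax before cmpxchg */
      if (atomic_compare_exchange_strong (lock, &expected, 0))
        return 0;                          /* we took the lock */
      return EBUSY;                        /* someone else holds it */
    }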
+ Copyright (C) 1994-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu> + Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au> + This version is developed using the same algorithm as the fast C + version which carries the following introduction: + Based on strlen implementation by Torbjorn Granlund (tege@sics.se), + with help from Dan Sahlin (dan@sics.se) and + commentary by Jim Blandy (jimb@ai.mit.edu); + adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu), + and implemented by Roland McGrath (roland@ai.mit.edu). + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4+4 /* space for 1 saved reg */ +#define RTN PARMS +#define STR RTN +#define CHR STR+4 + + .text +ENTRY (__rawmemchr) + + /* Save callee-safe register used in this function. */ + pushl %edi + cfi_adjust_cfa_offset (4) + cfi_rel_offset (edi, 0) + + /* Load parameters into registers. */ + movl STR(%esp), %eax + movl CHR(%esp), %edx + + /* At the moment %edx contains C. What we need for the + algorithm is C in all bytes of the dword. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). */ + movb %dl, %dh /* Now it is 0|0|c|c */ + movl %edx, %ecx + shll $16, %edx /* Now c|c|0|0 */ + movw %cx, %dx /* And finally c|c|c|c */ + + /* Better performance can be achieved if the word (32 + bit) memory access is aligned on a four-byte-boundary. + So process first bytes one by one until boundary is + reached. Don't use a loop for better performance. */ + + testb $3, %al /* correctly aligned ? */ + je L(1) /* yes => begin loop */ + cmpb %dl, (%eax) /* compare byte */ + je L(9) /* target found => return */ + incl %eax /* increment source pointer */ + + testb $3, %al /* correctly aligned ? */ + je L(1) /* yes => begin loop */ + cmpb %dl, (%eax) /* compare byte */ + je L(9) /* target found => return */ + incl %eax /* increment source pointer */ + + testb $3, %al /* correctly aligned ? */ + je L(1) /* yes => begin loop */ + cmpb %dl, (%eax) /* compare byte */ + je L(9) /* target found => return */ + incl %eax /* increment source pointer */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. 
If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. */ + + + /* Each round the main loop processes 16 bytes. */ + ALIGN (4) + +L(1): movl (%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + + /* According to the algorithm we had to reverse the effect of the + XOR first and then test the overflow bits. But because the + following XOR would destroy the carry flag and it would (in a + representation with more than 32 bits) not alter then last + overflow, we can now test this condition. If no carry is signaled + no overflow must have occurred in the last byte => it was 0. */ + jnc L(8) + + /* We are only interested in carry bits that change due to the + previous add, so remove original bits */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + + /* Now test for the other three overflow bits. */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + /* If at least one byte of the word is C we don't get 0 in %edi. */ + jnz L(8) /* found it => return pointer */ + + /* This process is unfolded four times for better performance. + we don't increment the source pointer each time. Instead we + use offsets and increment by 16 in each run of the loop. But + before probing for the matching byte we need some extra code + (following LL(13) below). Even the len can be compared with + constants instead of decrementing each time. */ + + movl 4(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(7) /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(7) /* found it => return pointer */ + + movl 8(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(6) /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(6) /* found it => return pointer */ + + movl 12(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. 
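The carry reasoning laid out in this comment can be cross-checked in C. One round of the detector, using the same 0xfefefeff constant (a sketch only; the assembly fuses these steps with the unrolled pointer bookkeeping):

    #include <stdint.h>

    /* Nonzero iff some byte of WORD equals C.  */
    static int
    word_has_byte (uint32_t word, unsigned char c)
    {
      uint32_t magic = 0xfefefeffu;
      uint32_t x = word ^ (0x01010101u * c); /* bytes equal to c become 0 */
      uint32_t sum = x + magic;
      if (sum >= x)             /* no carry out (jnc): top byte was 0 */
        return 1;
      /* Carry was signalled; test the three lower per-byte carries
         exactly as the xorl/orl/incl sequence does.  */
      return (((sum ^ x) | magic) + 1) != 0;
    }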
We get
+			     carry bits reported for each byte which
+			     is *not* 0 */
+	jnc	L(5)			/* highest byte is C => return pointer */
+	xorl	%ecx, %edi		/* ((word^charmask)+magic)^(word^charmask) */
+	orl	$0xfefefeff, %edi	/* set all non-carry bits */
+	incl	%edi			/* add 1: if one carry bit was *not* set
+					   the addition will not result in 0. */
+	jnz	L(5)			/* found it => return pointer */
+
+	/* Adjust both counters for a full round, i.e. 16 bytes. */
+	addl	$16, %eax
+	jmp	L(1)
+	/* add missing source pointer increments */
+L(5):	addl	$4, %eax
+L(6):	addl	$4, %eax
+L(7):	addl	$4, %eax
+
+	/* Test for the matching byte in the word.  %ecx contains a NUL
+	   char in the byte which originally was the byte we are looking
+	   at.  */
+L(8):	testb	%cl, %cl	/* test first byte in dword */
+	jz	L(9)		/* if zero => return pointer */
+	incl	%eax		/* increment source pointer */
+
+	testb	%ch, %ch	/* test second byte in dword */
+	jz	L(9)		/* if zero => return pointer */
+	incl	%eax		/* increment source pointer */
+
+	testl	$0xff0000, %ecx	/* test third byte in dword */
+	jz	L(9)		/* if zero => return pointer */
+	incl	%eax		/* increment source pointer */
+
+	/* No further test needed since we know it is one of the four bytes.  */
+
+L(9):
+	popl	%edi		/* pop saved register */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (edi)
+
+	ret
+END (__rawmemchr)
+
+libc_hidden_def (__rawmemchr)
+weak_alias (__rawmemchr, rawmemchr)
diff --git a/REORG.TODO/sysdeps/i386/rshift.S b/REORG.TODO/sysdeps/i386/rshift.S
new file mode 100644
index 0000000000..cf179052b5
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/rshift.S
@@ -0,0 +1,105 @@
+/* i80386 __mpn_rshift --
+   Copyright (C) 1992-2017 Free Software Foundation, Inc.
+   This file is part of the GNU MP Library.
+
+   The GNU MP Library is free software; you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as published by
+   the Free Software Foundation; either version 2.1 of the License, or (at your
+   option) any later version.
+
+   The GNU MP Library is distributed in the hope that it will be useful, but
+   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+   License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with the GNU MP Library; see the file COPYING.LIB.  If not,
+   see <http://www.gnu.org/licenses/>.
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+12 /* space for 3 saved regs */ +#define RES PARMS +#define S RES+4 +#define SIZE S+4 +#define CNT SIZE+4 + + .text +ENTRY (__mpn_rshift) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + pushl %ebx + cfi_adjust_cfa_offset (4) + + movl RES(%esp),%edi + cfi_rel_offset (edi, 8) + movl S(%esp),%esi + cfi_rel_offset (esi, 4) + movl SIZE(%esp),%edx + movl CNT(%esp),%ecx + leal -4(%edi,%edx,4),%edi + leal (%esi,%edx,4),%esi + negl %edx + + movl (%esi,%edx,4),%ebx /* read least significant limb */ + cfi_rel_offset (ebx, 0) + cfi_remember_state + xorl %eax,%eax + shrdl %cl,%ebx,%eax /* compute carry limb */ + incl %edx + jz L(end) + pushl %eax /* push carry limb onto stack */ + cfi_adjust_cfa_offset (4) + testb $1,%dl + jnz L(1) /* enter loop in the middle */ + movl %ebx,%eax + + ALIGN (3) +L(oop): movl (%esi,%edx,4),%ebx /* load next higher limb */ + shrdl %cl,%ebx,%eax /* compute result limb */ + movl %eax,(%edi,%edx,4) /* store it */ + incl %edx +L(1): movl (%esi,%edx,4),%eax + shrdl %cl,%eax,%ebx + movl %ebx,(%edi,%edx,4) + incl %edx + jnz L(oop) + + shrl %cl,%eax /* compute most significant limb */ + movl %eax,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + cfi_adjust_cfa_offset (-4) + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret + + cfi_restore_state +L(end): shrl %cl,%ebx /* compute most significant limb */ + movl %ebx,(%edi) /* store it */ + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_rshift) diff --git a/REORG.TODO/sysdeps/i386/setfpucw.c b/REORG.TODO/sysdeps/i386/setfpucw.c new file mode 100644 index 0000000000..40b995f18a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/setfpucw.c @@ -0,0 +1,54 @@ +/* Set the FPU control word for x86. + Copyright (C) 2003-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <math.h> +#include <fpu_control.h> +#include <fenv.h> +#include <unistd.h> +#include <ldsodefs.h> +#include <dl-procinfo.h> + +void +__setfpucw (fpu_control_t set) +{ + fpu_control_t cw; + + /* Fetch the current control word. */ + __asm__ ("fnstcw %0" : "=m" (*&cw)); + + /* Preserve the reserved bits, and set the rest as the user + specified (or the default, if the user gave zero). */ + cw &= _FPU_RESERVED; + cw |= set & ~_FPU_RESERVED; + + __asm__ ("fldcw %0" : : "m" (*&cw)); + + /* If the CPU supports SSE, we set the MXCSR as well. */ + if (HAS_CPU_FEATURE (SSE)) + { + unsigned int xnew_exc; + + /* Get the current MXCSR. 
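The shift counts in the MXCSR update just below follow from the two register layouts: rounding control sits in bits 10-11 of the x87 control word but in bits 13-14 of MXCSR (hence << 3), and the exception-mask bits move from 0-5 to 7-12 (hence << 7). The pure bit manipulation, as a sketch (0x3f here stands for all six x87 mask bits; glibc's FE_ALL_EXCEPT is the subset without the denormal-operand bit):

    #include <stdint.h>

    static uint32_t
    demo_mxcsr_from_cw (uint32_t mxcsr, uint32_t cw)
    {
      mxcsr &= ~((0xc00u << 3) | (0x3fu << 7)); /* clear RC and mask bits */
      mxcsr |= ((cw & 0xc00u) << 3)   /* RC: CW bits 10-11 -> MXCSR 13-14 */
             | ((cw & 0x3fu) << 7);   /* masks: CW bits 0-5 -> MXCSR 7-12 */
      return mxcsr;
    }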
*/ + __asm__ ("stmxcsr %0" : "=m" (*&xnew_exc)); + + xnew_exc &= ~((0xc00 << 3) | (FE_ALL_EXCEPT << 7)); + xnew_exc |= ((set & 0xc00) << 3) | ((set & FE_ALL_EXCEPT) << 7); + + __asm__ ("ldmxcsr %0" : : "m" (*&xnew_exc)); + } +} diff --git a/REORG.TODO/sysdeps/i386/setjmp.S b/REORG.TODO/sysdeps/i386/setjmp.S new file mode 100644 index 0000000000..738a899e8b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/setjmp.S @@ -0,0 +1,58 @@ +/* setjmp for i386. + Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <jmpbuf-offsets.h> +#include <asm-syntax.h> +#include <stap-probe.h> + +#define PARMS 4 /* no space for saved regs */ +#define JMPBUF PARMS +#define SIGMSK JMPBUF+4 + +ENTRY (__sigsetjmp) + + movl JMPBUF(%esp), %eax + + /* Save registers. */ + movl %ebx, (JB_BX*4)(%eax) + movl %esi, (JB_SI*4)(%eax) + movl %edi, (JB_DI*4)(%eax) + leal JMPBUF(%esp), %ecx /* Save SP as it will be after we return. */ +#ifdef PTR_MANGLE + PTR_MANGLE (%ecx) +#endif + movl %ecx, (JB_SP*4)(%eax) + movl 0(%esp), %ecx /* Save PC we are returning to now. */ + LIBC_PROBE (setjmp, 3, 4@%eax, -4@SIGMSK(%esp), 4@%ecx) +#ifdef PTR_MANGLE + PTR_MANGLE (%ecx) +#endif + movl %ecx, (JB_PC*4)(%eax) + movl %ebp, (JB_BP*4)(%eax) /* Save caller's frame pointer. */ + +#if IS_IN (rtld) + /* In ld.so we never save the signal mask. */ + xorl %eax, %eax + ret +#else + /* Make a tail call to __sigjmp_save; it takes the same args. */ + jmp __sigjmp_save +#endif +END (__sigsetjmp) +hidden_def (__sigsetjmp) diff --git a/REORG.TODO/sysdeps/i386/stackguard-macros.h b/REORG.TODO/sysdeps/i386/stackguard-macros.h new file mode 100644 index 0000000000..039762927c --- /dev/null +++ b/REORG.TODO/sysdeps/i386/stackguard-macros.h @@ -0,0 +1,12 @@ +#include <stdint.h> + +#define STACK_CHK_GUARD \ + ({ uintptr_t x; asm ("movl %%gs:0x14, %0" : "=r" (x)); x; }) + +#define POINTER_CHK_GUARD \ + ({ \ + uintptr_t x; \ + asm ("movl %%gs:%c1, %0" : "=r" (x) \ + : "i" (offsetof (tcbhead_t, pointer_guard))); \ + x; \ + }) diff --git a/REORG.TODO/sysdeps/i386/stackinfo.h b/REORG.TODO/sysdeps/i386/stackinfo.h new file mode 100644 index 0000000000..ba17867d3a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/stackinfo.h @@ -0,0 +1,43 @@ +/* Copyright (C) 1999-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This file contains a bit of information about the stack allocation + of the processor. */ + +#ifndef _STACKINFO_H +#define _STACKINFO_H 1 + +#include <elf.h> + +/* On x86 the stack grows down. */ +#define _STACK_GROWS_DOWN 1 + +/* Default to an executable stack. PF_X can be overridden if PT_GNU_STACK is + * present, but it is presumed absent. */ +#define DEFAULT_STACK_PERMS (PF_R|PF_W|PF_X) + +/* Access to the stack pointer. The macros are used in alloca_account + for which they need to act as barriers as well, hence the additional + (unnecessary) parameters. */ +#define stackinfo_get_sp() \ + ({ void *p__; asm volatile ("mov %%esp, %0" : "=r" (p__)); p__; }) +#define stackinfo_sub_sp(ptr) \ + ({ ptrdiff_t d__; \ + asm volatile ("sub %%esp, %0" : "=r" (d__) : "0" (ptr)); \ + d__; }) + +#endif /* stackinfo.h */ diff --git a/REORG.TODO/sysdeps/i386/start.S b/REORG.TODO/sysdeps/i386/start.S new file mode 100644 index 0000000000..ccb1e2b38f --- /dev/null +++ b/REORG.TODO/sysdeps/i386/start.S @@ -0,0 +1,139 @@ +/* Startup code compliant to the ELF i386 ABI. + Copyright (C) 1995-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + In addition to the permissions in the GNU Lesser General Public + License, the Free Software Foundation gives you unlimited + permission to link the compiled version of this file with other + programs, and to distribute those programs without any restriction + coming from the use of this file. (The GNU Lesser General Public + License restrictions do apply in other respects; for example, they + cover modification of the file, and distribution when not linked + into another program.) + + Note that people who make modified versions of this file are not + obligated to grant this special exception for their modified + versions; it is their choice whether to do so. The GNU Lesser + General Public License gives permission to release a modified + version without this exception; this exception also makes it + possible to release a modified version which carries forward this + exception. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This is the canonical entry point, usually the first thing in the text + segment. The SVR4/i386 ABI (pages 3-31, 3-32) says that when the entry + point runs, most registers' values are unspecified, except for: + + %edx Contains a function pointer to be registered with `atexit'. + This is how the dynamic linker arranges to have DT_FINI + functions called for shared libraries that have been loaded + before this code runs. + + %esp The stack contains the arguments and environment: + 0(%esp) argc + 4(%esp) argv[0] + ... + (4*argc)(%esp) NULL + (4*(argc+1))(%esp) envp[0] + ... 
+ NULL +*/ + + .text + .globl _start + .type _start,@function +_start: + /* Clear the frame pointer. The ABI suggests this be done, to mark + the outermost frame obviously. */ + xorl %ebp, %ebp + + /* Extract the arguments as encoded on the stack and set up + the arguments for `main': argc, argv. envp will be determined + later in __libc_start_main. */ + popl %esi /* Pop the argument count. */ + movl %esp, %ecx /* argv starts just at the current stack top.*/ + + /* Before pushing the arguments align the stack to a 16-byte + (SSE needs 16-byte alignment) boundary to avoid penalties from + misaligned accesses. Thanks to Edward Seidl <seidl@janed.com> + for pointing this out. */ + andl $0xfffffff0, %esp + pushl %eax /* Push garbage because we allocate + 28 more bytes. */ + + /* Provide the highest stack address to the user code (for stacks + which grow downwards). */ + pushl %esp + + pushl %edx /* Push address of the shared library + termination function. */ + +#ifdef SHARED + /* Load PIC register. */ + call 1f + addl $_GLOBAL_OFFSET_TABLE_, %ebx + + /* Push address of our own entry points to .fini and .init. */ + leal __libc_csu_fini@GOTOFF(%ebx), %eax + pushl %eax + leal __libc_csu_init@GOTOFF(%ebx), %eax + pushl %eax + + pushl %ecx /* Push second argument: argv. */ + pushl %esi /* Push first argument: argc. */ + + pushl main@GOT(%ebx) + + /* Call the user's main function, and exit with its value. + But let the libc call main. */ + call __libc_start_main@PLT +#else + /* Push address of our own entry points to .fini and .init. */ + pushl $__libc_csu_fini + pushl $__libc_csu_init + + pushl %ecx /* Push second argument: argv. */ + pushl %esi /* Push first argument: argc. */ + + pushl $main + + /* Call the user's main function, and exit with its value. + But let the libc call main. */ + call __libc_start_main +#endif + + hlt /* Crash if somehow `exit' does return. */ + +#ifdef SHARED +1: movl (%esp), %ebx + ret +#endif + +/* To fulfill the System V/i386 ABI we need this symbol. Yuck, it's so + meaningless since we don't support machines < 80386. */ + .section .rodata + .globl _fp_hw +_fp_hw: .long 3 + .size _fp_hw, 4 + .type _fp_hw,@object + +/* Define a symbol for the first piece of initialized data. */ + .data + .globl __data_start +__data_start: + .long 0 + .weak data_start + data_start = __data_start diff --git a/REORG.TODO/sysdeps/i386/stpcpy.S b/REORG.TODO/sysdeps/i386/stpcpy.S new file mode 100644 index 0000000000..d9981b677b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/stpcpy.S @@ -0,0 +1,88 @@ +/* Copy SRC to DEST returning the address of the terminating '\0' in DEST. + For Intel 80x86, x>=3. + Copyright (C) 1994-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper (drepper@gnu.ai.mit.edu). + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
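In C terms, the pushes in _start above line up with a seven-argument call into libc; the prototype below paraphrases glibc's csu interface (argument types simplified, and the function never returns to _start, which is why a stray return hits the hlt):

    /* What the pushes set up, right to left on the stack:
       __libc_start_main (main, argc, argv, __libc_csu_init,
                          __libc_csu_fini, rtld_fini (from %edx),
                          stack_end);  */
    extern int __libc_start_main (int (*main) (int, char **, char **),
                                  int argc, char **argv,
                                  void (*init) (void),
                                  void (*fini) (void),
                                  void (*rtld_fini) (void),
                                  void *stack_end);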
*/ + +/* This function is defined neither in ANSI nor POSIX standards but is + also not invented here. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4 /* no space for saved regs */ +#define RTN PARMS +#define DEST RTN +#define SRC DEST+4 + + .text +ENTRY (__stpcpy) + + movl DEST(%esp), %eax + movl SRC(%esp), %ecx + subl %eax, %ecx /* magic: reduce number of loop variants + to one using addressing mode */ + + /* Here we would like to write + + subl $4, %eax + ALIGN (4) + + but the assembler is too smart and optimizes for the shortest + form where the number only needs one byte. But if we could + have the long form we would not need the alignment. */ + + .byte 0x81, 0xe8 /* This is `subl $0x00000004, %eax' */ + .long 0x00000004 + + /* Four times unfolded loop with only one loop counter. This + is achieved by the use of index+base addressing mode. As the + loop counter we use the destination address because this is + also the result. */ +L(1): addl $4, %eax /* increment loop counter */ + + movb (%eax,%ecx), %dl /* load current char */ + movb %dl, (%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jz L(2) /* yes, then exit */ + + movb 1(%eax,%ecx), %dl /* load current char */ + movb %dl, 1(%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jz L(3) /* yes, then exit */ + + movb 2(%eax,%ecx), %dl /* load current char */ + movb %dl, 2(%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jz L(4) /* yes, then exit */ + + movb 3(%eax,%ecx), %dl /* load current char */ + movb %dl, 3(%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jnz L(1) /* no, then continue loop */ + + incl %eax /* correct loop counter */ +L(4): incl %eax +L(3): incl %eax +L(2): + + ret +END (__stpcpy) + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/REORG.TODO/sysdeps/i386/stpncpy.S b/REORG.TODO/sysdeps/i386/stpncpy.S new file mode 100644 index 0000000000..46f2aba713 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/stpncpy.S @@ -0,0 +1,147 @@ +/* copy no more than N bytes from SRC to DEST, returning the address of + the terminating '\0' in DEST. + For Intel 80x86, x>=3. + Copyright (C) 1994-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu> + Some bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au> + - original wrote n+1 chars in some cases. + - stpncpy() ought to behave like strncpy() ie. not null-terminate + if limited by n. glibc-1.09 stpncpy() does this. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
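The `subl %eax, %ecx' in stpcpy above folds two induction variables into one: only the destination pointer is advanced, and the source is reached through destination plus a constant offset. In C (the pointer subtraction is well defined at the machine level, as in the assembly, though ISO C leaves it undefined for unrelated objects):

    #include <stddef.h>

    char *
    demo_stpcpy (char *dst, const char *src)
    {
      ptrdiff_t off = src - dst;    /* the "magic" offset */
      for (;; ++dst)
        {
          char c = dst[off];        /* dst + off addresses the source */
          *dst = c;
          if (c == '\0')
            return dst;             /* address of the terminating NUL */
        }
    }

The truncation behaviour the stpncpy header describes (no terminating NUL when limited by n, NUL padding otherwise) is easy to misremember; a short usage check:

    #include <stdio.h>
    #include <string.h>

    int
    main (void)
    {
      char buf[8];

      char *end = stpncpy (buf, "hello", 4); /* truncates, writes no NUL */
      printf ("%td\n", end - buf);           /* prints 4: one past copy */

      end = stpncpy (buf, "hi", sizeof buf); /* copies "hi", pads with NULs */
      printf ("%td\n", end - buf);           /* prints 2: first NUL written */
      return 0;
    }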
*/
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+#define LEN	SRC+4
+
+	.text
+ENTRY (__stpncpy)
+
+	pushl	%esi
+	cfi_adjust_cfa_offset (4)
+
+	movl	DEST(%esp), %eax
+	movl	SRC(%esp), %esi
+	cfi_rel_offset (esi, 0)
+	movl	LEN(%esp), %ecx
+
+	subl	%eax, %esi	/* magic: reduce number of loop variants
+				   to one using addressing mode */
+	jmp	L(1)		/* jump to loop "head" */
+
+	ALIGN(4)
+
+	/* Four times unfolded loop with two loop counters.  We get the
+	   third value (the source address) by using the index+base
+	   addressing mode.  */
+L(2):	movb	(%eax,%esi), %dl	/* load current char */
+	movb	%dl, (%eax)		/* and store it */
+	testb	%dl, %dl		/* was it NUL? */
+	jz	L(7)			/* yes, then exit */
+
+	movb	1(%eax,%esi), %dl	/* load current char */
+	movb	%dl, 1(%eax)		/* and store it */
+	testb	%dl, %dl		/* was it NUL? */
+	jz	L(6)			/* yes, then exit */
+
+	movb	2(%eax,%esi), %dl	/* load current char */
+	movb	%dl, 2(%eax)		/* and store it */
+	testb	%dl, %dl		/* was it NUL? */
+	jz	L(5)			/* yes, then exit */
+
+	movb	3(%eax,%esi), %dl	/* load current char */
+	movb	%dl, 3(%eax)		/* and store it */
+	testb	%dl, %dl		/* was it NUL? */
+	jz	L(4)			/* yes, then exit */
+
+	addl	$4, %eax		/* increment loop counter for full round */
+
+L(1):	subl	$4, %ecx		/* still more than 4 bytes allowed? */
+	jae	L(2)			/* yes, then go to start of loop */
+
+	/* The maximal remaining 15 bytes are not processed in a loop.  */
+
+	addl	$4, %ecx		/* correct above subtraction */
+	jz	L(9)			/* maximal allowed char reached => go to end */
+
+	movb	(%eax,%esi), %dl	/* load current char */
+	movb	%dl, (%eax)		/* and store it */
+	testb	%dl, %dl		/* was it NUL? */
+	jz	L(3)			/* yes, then exit */
+
+	incl	%eax			/* increment pointer */
+	decl	%ecx			/* decrement length counter */
+	jz	L(9)			/* no more allowed => exit */
+
+	movb	(%eax,%esi), %dl	/* load current char */
+	movb	%dl, (%eax)		/* and store it */
+	testb	%dl, %dl		/* was it NUL? */
+	jz	L(3)			/* yes, then exit */
+
+	incl	%eax			/* increment pointer */
+	decl	%ecx			/* decrement length counter */
+	jz	L(9)			/* no more allowed => exit */
+
+	movb	(%eax,%esi), %dl	/* load current char */
+	movb	%dl, (%eax)		/* and store it */
+	testb	%dl, %dl		/* was it NUL? */
+	jz	L(3)			/* yes, then exit */
+
+	incl	%eax			/* increment pointer */
+	jmp	L(9)			/* we don't have to test for counter underflow
+					   because we know we had at most 3 bytes
+					   remaining => exit */
+
+	/* When coming from the main loop we have to adjust the pointer.  */
+L(4):	decl	%ecx			/* decrement counter */
+	incl	%eax			/* increment pointer */
+
+L(5):	decl	%ecx			/* decrement counter */
+	incl	%eax			/* increment pointer */
+
+L(6):	decl	%ecx			/* decrement counter */
+	incl	%eax			/* increment pointer */
+L(7):
+
+	addl	$3, %ecx		/* correct pre-decrementation of counter
+					   at the beginning of the loop; but why 3
+					   and not 4?  Very simple, we have to count
+					   the NUL char we already wrote.  */
+	jz	L(9)			/* counter is also 0 => exit */
+
+	/* We now have to fill the rest of the buffer with NUL.  This
+	   is done in a tricky way.  Please note that the addressing mode
+	   used below is not the same we used above.  Here we use the
+	   %ecx register.  */
+L(8):
+	movb	$0, (%ecx,%eax)		/* store NUL char */
+L(3):	decl	%ecx			/* all bytes written?
*/
+	jnz	L(8)			/* no, then again */
+
+L(9):	popl	%esi			/* restore saved register content */
+	cfi_adjust_cfa_offset (-4)
+	cfi_restore (esi)
+
+	ret
+END (__stpncpy)
+
+libc_hidden_def (__stpncpy)
+weak_alias (__stpncpy, stpncpy)
diff --git a/REORG.TODO/sysdeps/i386/strcat.S b/REORG.TODO/sysdeps/i386/strcat.S
new file mode 100644
index 0000000000..4a26b3c528
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/strcat.S
@@ -0,0 +1,265 @@
+/* strcat(dest, src) -- Append SRC on the end of DEST.
+   For Intel 80x86, x>=4.
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@ipd.info.uni-karlsruhe.de>.
+   Optimised a little by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#define RTN	PARMS
+#define DEST	RTN
+#define SRC	DEST+4
+
+	.text
+ENTRY (strcat)
+
+	pushl	%edi		/* Save callee-safe register. */
+	cfi_adjust_cfa_offset (4)
+
+	movl	DEST(%esp), %edx
+	movl	SRC(%esp), %ecx
+
+	testb	$0xff, (%ecx)	/* Is source string empty? */
+	jz	L(8)		/* yes => return */
+
+	/* Test the first bytes separately until destination is aligned. */
+	testl	$3, %edx	/* destination pointer aligned? */
+	jz	L(1)		/* yes => begin scan loop */
+	testb	$0xff, (%edx)	/* is end of string? */
+	jz	L(2)		/* yes => start appending */
+	incl	%edx		/* increment destination pointer */
+
+	testl	$3, %edx	/* destination pointer aligned? */
+	jz	L(1)		/* yes => begin scan loop */
+	testb	$0xff, (%edx)	/* is end of string? */
+	jz	L(2)		/* yes => start appending */
+	incl	%edx		/* increment destination pointer */
+
+	testl	$3, %edx	/* destination pointer aligned? */
+	jz	L(1)		/* yes => begin scan loop */
+	testb	$0xff, (%edx)	/* is end of string? */
+	jz	L(2)		/* yes => start appending */
+	incl	%edx		/* increment destination pointer */
+
+	/* Now we are aligned.  Begin scan loop. */
+	jmp	L(1)
+
+	cfi_rel_offset (edi, 0)
+	ALIGN(4)
+
+L(4):	addl	$16,%edx	/* increment destination pointer for round */
+
+L(1):	movl	(%edx), %eax	/* get word (= 4 bytes) in question */
+	movl	$0xfefefeff, %edi /* magic value */
+
+	/* If you compare this with the algorithm in memchr.S you will
+	   notice that here is an `xorl' statement missing.  But you must
+	   not forget that we are looking for C == 0 and `xorl $0, %eax'
+	   is a no-op. */
+
+	addl	%eax, %edi	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+
+	/* According to the algorithm we had to reverse the effect of the
+	   XOR first and then test the overflow bits.  But because the
+	   following XOR would destroy the carry flag and it would (in a
+	   representation with more than 32 bits) not alter the last
+	   overflow, we can now test this condition.  If no carry is signaled
+	   no overflow must have occurred in the last byte => it was 0.
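strcat runs in two phases, both visible in the surrounding code: a rawmemchr-style word scan over DEST to find its terminating NUL, then an stpcpy-style copy of SRC to that point. The overall shape in C, as a sketch only:

    char *
    demo_strcat (char *dest, const char *src)
    {
      char *end = dest;
      while (*end != '\0')                /* phase 1: word-at-a-time here */
        ++end;
      while ((*end++ = *src++) != '\0')   /* phase 2: copy incl. the NUL */
        ;
      return dest;                        /* strcat returns DEST */
    }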
*/
+	jnc	L(3)
+
+	/* We are only interested in carry bits that change due to the
+	   previous add, so remove original bits */
+	xorl	%eax, %edi	/* ((word^charmask)+magic)^(word^charmask) */
+
+	/* Now test for the other three overflow bits. */
+	orl	$0xfefefeff, %edi /* set all non-carry bits */
+	incl	%edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0. */
+
+	/* If at least one byte of the word is C we don't get 0 in %edi. */
+	jnz	L(3)
+
+	movl	4(%edx), %eax	/* get word from destination */
+	movl	$0xfefefeff, %edi /* magic value */
+	addl	%eax, %edi	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	L(5)		/* highest byte is C => stop copying */
+	xorl	%eax, %edi	/* ((word^charmask)+magic)^(word^charmask) */
+	orl	$0xfefefeff, %edi /* set all non-carry bits */
+	incl	%edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0. */
+	jnz	L(5)		/* one byte is NUL => stop copying */
+
+	movl	8(%edx), %eax	/* get word from destination */
+	movl	$0xfefefeff, %edi /* magic value */
+	addl	%eax, %edi	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	L(6)		/* highest byte is C => stop copying */
+	xorl	%eax, %edi	/* ((word^charmask)+magic)^(word^charmask) */
+	orl	$0xfefefeff, %edi /* set all non-carry bits */
+	incl	%edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0. */
+	jnz	L(6)		/* one byte is NUL => stop copying */
+
+	movl	12(%edx), %eax	/* get word from destination */
+	movl	$0xfefefeff, %edi /* magic value */
+	addl	%eax, %edi	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	L(7)		/* highest byte is C => stop copying */
+	xorl	%eax, %edi	/* ((word^charmask)+magic)^(word^charmask) */
+	orl	$0xfefefeff, %edi /* set all non-carry bits */
+	incl	%edi		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0. */
+	jz	L(4)		/* no byte is NUL => carry on copying */
+
+L(7):	addl	$4, %edx	/* adjust destination pointer */
+L(6):	addl	$4, %edx
+L(5):	addl	$4, %edx
+
+L(3):	testb	%al, %al	/* is first byte NUL? */
+	jz	L(2)		/* yes => start copying */
+	incl	%edx		/* increment destination pointer */
+
+	testb	%ah, %ah	/* is second byte NUL? */
+	jz	L(2)		/* yes => start copying */
+	incl	%edx		/* increment destination pointer */
+
+	testl	$0xff0000, %eax	/* is third byte NUL? */
+	jz	L(2)		/* yes => start copying */
+	incl	%edx		/* increment destination pointer */
+
+L(2):	subl	%ecx, %edx	/* reduce number of loop variants */
+
+	/* Now we have to align the source pointer. */
+	testl	$3, %ecx	/* pointer correctly aligned? */
+	jz	L(29)		/* yes => start copy loop */
+	movb	(%ecx), %al	/* get first byte */
+	movb	%al, (%ecx,%edx) /* and store it */
+	andb	%al, %al	/* is byte NUL? */
+	jz	L(8)		/* yes => return */
+	incl	%ecx		/* increment pointer */
+
+	testl	$3, %ecx	/* pointer correctly aligned? */
+	jz	L(29)		/* yes => start copy loop */
+	movb	(%ecx), %al	/* get first byte */
+	movb	%al, (%ecx,%edx) /* and store it */
+	andb	%al, %al	/* is byte NUL? */
+	jz	L(8)		/* yes => return */
+	incl	%ecx		/* increment pointer */
+
+	testl	$3, %ecx	/* pointer correctly aligned? */
+	jz	L(29)		/* yes => start copy loop */
+	movb	(%ecx), %al	/* get first byte */
+	movb	%al, (%ecx,%edx) /* and store it */
+	andb	%al, %al	/* is byte NUL? */
+	jz	L(8)		/* yes => return */
+	incl	%ecx		/* increment pointer */
+
+	/* Now we are aligned.
*/ + jmp L(29) /* start copy loop */ + + ALIGN(4) + +L(28): movl %eax, 12(%ecx,%edx)/* store word at destination */ + addl $16, %ecx /* adjust pointer for full round */ + +L(29): movl (%ecx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(9) /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(9) /* one byte is NUL => stop copying */ + movl %eax, (%ecx,%edx) /* store word to destination */ + + movl 4(%ecx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(91) /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(91) /* one byte is NUL => stop copying */ + movl %eax, 4(%ecx,%edx) /* store word to destination */ + + movl 8(%ecx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(92) /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(92) /* one byte is NUL => stop copying */ + movl %eax, 8(%ecx,%edx) /* store word to destination */ + + movl 12(%ecx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(93) /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz L(28) /* no is NUL => carry on copying */ + +L(93): addl $4, %ecx /* adjust pointer */ +L(92): addl $4, %ecx +L(91): addl $4, %ecx + +L(9): movb %al, (%ecx,%edx) /* store first byte of last word */ + orb %al, %al /* is it NUL? */ + jz L(8) /* yes => return */ + + movb %ah, 1(%ecx,%edx) /* store second byte of last word */ + orb %ah, %ah /* is it NUL? */ + jz L(8) /* yes => return */ + + shrl $16, %eax /* make upper bytes accessible */ + movb %al, 2(%ecx,%edx) /* store third byte of last word */ + orb %al, %al /* is it NUL? */ + jz L(8) /* yes => return */ + + movb %ah, 3(%ecx,%edx) /* store fourth byte of last word */ + +L(8): movl DEST(%esp), %eax /* start address of destination is result */ + popl %edi /* restore saved register */ + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (strcat) +libc_hidden_builtin_def (strcat) diff --git a/REORG.TODO/sysdeps/i386/strchr.S b/REORG.TODO/sysdeps/i386/strchr.S new file mode 100644 index 0000000000..6075e77882 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/strchr.S @@ -0,0 +1,290 @@ +/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR. + For Intel 80x86, x>=3. 
+   Copyright (C) 1994-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+   Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS	4+4	/* space for 1 saved reg */
+#define RTN	PARMS
+#define STR	RTN
+#define CHR	STR+4
+
+	.text
+ENTRY (strchr)
+
+	pushl	%edi		/* Save callee-safe registers used here. */
+	cfi_adjust_cfa_offset (4)
+	cfi_rel_offset (edi, 0)
+	movl	STR(%esp), %eax
+	movl	CHR(%esp), %edx
+
+	/* At the moment %edx contains C.  What we need for the
+	   algorithm is C in all bytes of the dword.  Avoid
+	   operations on 16 bit words because these require a
+	   prefix byte (and one more cycle). */
+	movb	%dl, %dh	/* now it is 0|0|c|c */
+	movl	%edx, %ecx
+	shll	$16, %edx	/* now it is c|c|0|0 */
+	movw	%cx, %dx	/* and finally c|c|c|c */
+
+	/* Before we start with the main loop we process single bytes
+	   until the source pointer is aligned.  This has two reasons:
+	   1. aligned 32-bit memory access is faster
+	   and (more important)
+	   2. we process in the main loop 32 bit in one step although
+	      we don't know the end of the string.  But accessing at
+	      4-byte alignment guarantees that we never access illegal
+	      memory if this would not also be done by the trivial
+	      implementation (this is because all processor inherent
+	      boundaries are multiples of 4). */
+
+	testb	$3, %al		/* correctly aligned ? */
+	jz	L(11)		/* yes => begin loop */
+	movb	(%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb	%cl, %dl	/* compare byte */
+	je	L(6)		/* target found => return */
+	testb	%cl, %cl	/* is NUL? */
+	jz	L(2)		/* yes => return NULL */
+	incl	%eax		/* increment pointer */
+
+	testb	$3, %al		/* correctly aligned ? */
+	jz	L(11)		/* yes => begin loop */
+	movb	(%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb	%cl, %dl	/* compare byte */
+	je	L(6)		/* target found => return */
+	testb	%cl, %cl	/* is NUL? */
+	jz	L(2)		/* yes => return NULL */
+	incl	%eax		/* increment pointer */
+
+	testb	$3, %al		/* correctly aligned ? */
+	jz	L(11)		/* yes => begin loop */
+	movb	(%eax), %cl	/* load byte in question (we need it twice) */
+	cmpb	%cl, %dl	/* compare byte */
+	je	L(6)		/* target found => return */
+	testb	%cl, %cl	/* is NUL? */
+	jz	L(2)		/* yes => return NULL */
+	incl	%eax		/* increment pointer */
+
+	/* Now we have reached alignment. */
+	jmp	L(11)		/* begin loop */
+
+	/* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+	   change any of the hole bits of LONGWORD.
+
+	   1) Is this safe?  Will it catch all the zero bytes?
+	   Suppose there is a byte with all zeros.  Any carry bits
+	   propagating from its left will fall into the hole at its
+	   least significant bit and stop.
Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. */ + + /* Each round the main loop processes 16 bytes. */ + + ALIGN(4) + +L(1): addl $16, %eax /* adjust pointer for whole round */ + +L(11): movl (%eax), %ecx /* get word (= 4 bytes) in question */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* C */ + + /* According to the algorithm we had to reverse the effect of the + XOR first and then test the overflow bits. But because the + following XOR would destroy the carry flag and it would (in a + representation with more than 32 bits) not alter then last + overflow, we can now test this condition. If no carry is signaled + no overflow must have occurred in the last byte => it was 0. */ + jnc L(7) + + /* We are only interested in carry bits that change due to the + previous add, so remove original bits */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + + /* Now test for the other three overflow bits. */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + /* If at least one byte of the word is C we don't get 0 in %edi. */ + jnz L(7) /* found it => return pointer */ + + /* Now we made sure the dword does not contain the character we are + looking for. But because we deal with strings we have to check + for the end of string before testing the next dword. */ + + xorl %edx, %ecx /* restore original dword without reload */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(2) /* highest byte is NUL => return NULL */ + xorl %ecx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(2) /* found NUL => return NULL */ + + movl 4(%eax), %ecx /* get word (= 4 bytes) in question */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* C */ + jnc L(71) /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. 
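
+ In plain C each of these four-instruction tests reads
+ roughly as follows (an illustrative sketch in 32-bit
+ unsigned arithmetic, not part of the original code; WORD
+ is the loaded dword, CHARMASK the c|c|c|c value in %edx).
+ Bytes equal to C become 0 after the XOR, and S >= W
+ corresponds to the carry check performed by the JNC:
+
+ unsigned int w = word ^ charmask;
+ unsigned int s = w + 0xfefefeff;
+ int maybe_c = (s >= w)
+ || ((s ^ w) | 0xfefefeff) + 1 != 0;
+
+ The test can trigger spuriously for bytes following the
+ first zero byte, which is why the four bytes are then
+ inspected individually.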
*/ + jnz L(71) /* found it => return pointer */ + xorl %edx, %ecx /* restore original dword without reload */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(2) /* highest byte is NUL => return NULL */ + xorl %ecx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(2) /* found NUL => return NULL */ + + movl 8(%eax), %ecx /* get word (= 4 bytes) in question */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* C */ + jnc L(72) /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(72) /* found it => return pointer */ + xorl %edx, %ecx /* restore original dword without reload */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(2) /* highest byte is NUL => return NULL */ + xorl %ecx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(2) /* found NUL => return NULL */ + + movl 12(%eax), %ecx /* get word (= 4 bytes) in question */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* C */ + jnc L(73) /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(73) /* found it => return pointer */ + xorl %edx, %ecx /* restore original dword without reload */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(2) /* highest byte is NUL => return NULL */ + xorl %ecx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz L(1) /* no NUL found => restart loop */ + +L(2): /* Return NULL. */ + xorl %eax, %eax + popl %edi /* restore saved register content */ + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret + + cfi_adjust_cfa_offset (4) + cfi_rel_offset (edi, 0) +L(73): addl $4, %eax /* adjust pointer */ +L(72): addl $4, %eax +L(71): addl $4, %eax + + /* We now scan for the byte in which the character was matched. + But we have to take care of the case that a NUL char is + found before this in the dword. Note that we XORed %ecx + with the byte we're looking for, therefore the tests below look + reversed. */ + +L(7): testb %cl, %cl /* is first byte C? */ + jz L(6) /* yes => return pointer */ + cmpb %dl, %cl /* is first byte NUL? 
*/
+ je L(2) /* yes => return NULL */
+ incl %eax /* it's not in the first byte */
+
+ testb %ch, %ch /* is second byte C? */
+ jz L(6) /* yes => return pointer */
+ cmpb %dl, %ch /* is second byte NUL? */
+ je L(2) /* yes => return NULL */
+ incl %eax /* it's not in the second byte */
+
+ shrl $16, %ecx /* make upper byte accessible */
+ testb %cl, %cl /* is third byte C? */
+ jz L(6) /* yes => return pointer */
+ cmpb %dl, %cl /* is third byte NUL? */
+ je L(2) /* yes => return NULL */
+
+ /* It must be in the fourth byte and it cannot be NUL. */
+ incl %eax
+
+L(6):
+ popl %edi /* restore saved register content */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (strchr)
+
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr) diff --git a/REORG.TODO/sysdeps/i386/strchrnul.S b/REORG.TODO/sysdeps/i386/strchrnul.S new file mode 100644 index 0000000000..800b872c74 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/strchrnul.S @@ -0,0 +1,278 @@ +/* strchrnul (str, chr) -- Return pointer to first occurrence of CHR in STR
+ or the final NUL byte.
+ For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@gnu.org>
+ Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4+4 /* space for 1 saved reg */
+#define RTN PARMS
+#define STR RTN
+#define CHR STR+4
+
+ .text
+ENTRY (__strchrnul)
+
+ pushl %edi /* Save callee-safe registers used here. */
+ cfi_adjust_cfa_offset (4)
+ cfi_rel_offset (edi, 0)
+
+ movl STR(%esp), %eax
+ movl CHR(%esp), %edx
+
+ /* At the moment %edx contains CHR. What we need for the
+ algorithm is CHR in all bytes of the dword. Avoid
+ operations on 16 bit words because these require a
+ prefix byte (and one more cycle). */
+ movb %dl, %dh /* now it is 0|0|c|c */
+ movl %edx, %ecx
+ shll $16, %edx /* now it is c|c|0|0 */
+ movw %cx, %dx /* and finally c|c|c|c */
+
+ /* Before we start with the main loop we process single bytes
+ until the source pointer is aligned. This has two reasons:
+ 1. aligned 32-bit memory access is faster
+ and (more important)
+ 2. we process in the main loop 32 bit in one step although
+ we don't know the end of the string. But accessing at
+ 4-byte alignment guarantees that we never access illegal
+ memory if this would not also be done by the trivial
+ implementation (this is because all processor inherent
+ boundaries are multiples of 4). */
+
+ testb $3, %al /* correctly aligned ? */
+ jz L(11) /* yes => begin loop */
+ movb (%eax), %cl /* load byte in question (we need it twice) */
+ cmpb %cl, %dl /* compare byte */
+ je L(6) /* target found => return */
+ testb %cl, %cl /* is NUL?
*/
+ jz L(6) /* yes => return pointer */
+ incl %eax /* increment pointer */
+
+ testb $3, %al /* correctly aligned ? */
+ jz L(11) /* yes => begin loop */
+ movb (%eax), %cl /* load byte in question (we need it twice) */
+ cmpb %cl, %dl /* compare byte */
+ je L(6) /* target found => return */
+ testb %cl, %cl /* is NUL? */
+ jz L(6) /* yes => return pointer */
+ incl %eax /* increment pointer */
+
+ testb $3, %al /* correctly aligned ? */
+ jz L(11) /* yes => begin loop */
+ movb (%eax), %cl /* load byte in question (we need it twice) */
+ cmpb %cl, %dl /* compare byte */
+ je L(6) /* target found => return */
+ testb %cl, %cl /* is NUL? */
+ jz L(6) /* yes => return pointer */
+ incl %eax /* increment pointer */
+
+ /* Now we have reached alignment. */
+ jmp L(11) /* begin loop */
+
+ /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
+ change any of the hole bits of LONGWORD.
+
+ 1) Is this safe? Will it catch all the zero bytes?
+ Suppose there is a byte with all zeros. Any carry bits
+ propagating from its left will fall into the hole at its
+ least significant bit and stop. Since there will be no
+ carry from its most significant bit, the LSB of the
+ byte to the left will be unchanged, and the zero will be
+ detected.
+
+ 2) Is this worthwhile? Will it ignore everything except
+ zero bytes? Suppose every byte of LONGWORD has a bit set
+ somewhere. There will be a carry into bit 8. If bit 8
+ is set, this will carry into bit 16. If bit 8 is clear,
+ one of bits 9-15 must be set, so there will be a carry
+ into bit 16. Similarly, there will be a carry into bit
+ 24. If one of bits 24-31 is set, there will be a carry
+ into bit 32 (=carry flag), so all of the hole bits will
+ be changed.
+
+ 3) But wait! Aren't we looking for CHR, not zero?
+ Good point. So what we do is XOR LONGWORD with a longword,
+ each of whose bytes is CHR. This turns each byte that is CHR
+ into a zero. */
+
+ /* Each round the main loop processes 16 bytes. */
+
+ ALIGN(4)
+
+L(1): addl $16, %eax /* adjust pointer for whole round */
+
+L(11): movl (%eax), %ecx /* get word (= 4 bytes) in question */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* CHR */
+
+ /* According to the algorithm we had to reverse the effect of the
+ XOR first and then test the overflow bits. But because the
+ following XOR would destroy the carry flag and it would (in a
+ representation with more than 32 bits) not alter the last
+ overflow, we can now test this condition. If no carry is signaled
+ no overflow must have occurred in the last byte => it was 0. */
+ jnc L(7)
+
+ /* We are only interested in carry bits that change due to the
+ previous add, so remove original bits */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+
+ /* Now test for the other three overflow bits. */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ /* If at least one byte of the word is CHR we don't get 0 in %edi. */
+ jnz L(7) /* found it => return pointer */
+
+ /* Now we made sure the dword does not contain the character we are
+ looking for. But because we deal with strings we have to check
+ for the end of string before testing the next dword.
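
+ For reference, the result computed by this whole function
+ equals that of the trivial C version below (an illustrative
+ sketch, not the algorithm used here):
+
+ char *
+ strchrnul (const char *s, int c_in)
+ {
+ char c = (char) c_in;
+ while (*s != '\0' && *s != c)
+ ++s;
+ return (char *) s;
+ }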
*/
+
+ xorl %edx, %ecx /* restore original dword without reload */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(7) /* highest byte is NUL => return pointer */
+ xorl %ecx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(7) /* found NUL => return pointer */
+
+ movl 4(%eax), %ecx /* get word (= 4 bytes) in question */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* CHR */
+ jnc L(71) /* highest byte is CHR => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(71) /* found it => return pointer */
+ xorl %edx, %ecx /* restore original dword without reload */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(71) /* highest byte is NUL => return pointer */
+ xorl %ecx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(71) /* found NUL => return pointer */
+
+ movl 8(%eax), %ecx /* get word (= 4 bytes) in question */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* CHR */
+ jnc L(72) /* highest byte is CHR => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(72) /* found it => return pointer */
+ xorl %edx, %ecx /* restore original dword without reload */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(72) /* highest byte is NUL => return pointer */
+ xorl %ecx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(72) /* found NUL => return pointer */
+
+ movl 12(%eax), %ecx /* get word (= 4 bytes) in question */
+ xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c
+ are now 0 */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* CHR */
+ jnc L(73) /* highest byte is CHR => return pointer */
+ xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz L(73) /* found it => return pointer */
+ xorl %edx, %ecx /* restore original dword without reload */
+ movl $0xfefefeff, %edi /* magic value */
+ addl %ecx, %edi /* add the magic value to the word.
We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc L(73) /* highest byte is NUL => return pointer */
+ xorl %ecx, %edi /* (word+magic)^word */
+ orl $0xfefefeff, %edi /* set all non-carry bits */
+ incl %edi /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jz L(1) /* no NUL found => restart loop */
+
+L(73): addl $4, %eax /* adjust pointer */
+L(72): addl $4, %eax
+L(71): addl $4, %eax
+
+ /* We now scan for the byte in which the character was matched.
+ But we have to take care of the case that a NUL char is
+ found before this in the dword. */
+
+L(7): testb %cl, %cl /* is first byte CHR? */
+ jz L(6) /* yes => return pointer */
+ cmpb %dl, %cl /* is first byte NUL? */
+ je L(6) /* yes => return pointer to NUL */
+ incl %eax /* it's not in the first byte */
+
+ testb %ch, %ch /* is second byte CHR? */
+ jz L(6) /* yes => return pointer */
+ cmpb %dl, %ch /* is second byte NUL? */
+ je L(6) /* yes => return pointer to NUL */
+ incl %eax /* it's not in the second byte */
+
+ shrl $16, %ecx /* make upper byte accessible */
+ testb %cl, %cl /* is third byte CHR? */
+ jz L(6) /* yes => return pointer */
+ cmpb %dl, %cl /* is third byte NUL? */
+ je L(6) /* yes => return pointer to NUL */
+
+ /* It must be in the fourth byte and it cannot be NUL. */
+ incl %eax
+
+L(6): popl %edi /* restore saved register content */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (__strchrnul)
+
+weak_alias (__strchrnul, strchrnul) diff --git a/REORG.TODO/sysdeps/i386/strcspn.S b/REORG.TODO/sysdeps/i386/strcspn.S new file mode 100644 index 0000000000..c852a3b1e5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/strcspn.S @@ -0,0 +1,240 @@ +/* strcspn (str, ss) -- Return the length of the initial segment of STR
+ which contains no characters from SS.
+ For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+ Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define STR PARMS
+#define STOP STR+4
+
+ .text
+ENTRY (strcspn)
+
+ movl STR(%esp), %edx
+ movl STOP(%esp), %eax
+
+ /* First we create a table with flags for all possible characters.
+ For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+ supported by the C string functions we have 256 characters.
+ Before inserting marks for the stop characters we clear the whole
+ table. The unrolled form is much faster than a loop. */
+ xorl %ecx, %ecx /* %ecx = 0 !!!
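
+ In C the table-driven algorithm implemented below reads
+ roughly as follows (an illustrative sketch; the real code
+ builds the table on the stack with the pushes below and
+ keeps %ecx zero-extended for the indexed byte stores).
+ The first loop marks every stop character by storing its
+ own value; since table[0] == 0 == NUL the second loop also
+ stops at the terminating NUL:
+
+ size_t
+ strcspn (const char *str, const char *stopset)
+ {
+ unsigned char table[256] = { 0 };
+ const unsigned char *s;
+
+ for (s = (const unsigned char *) stopset; *s != '\0'; ++s)
+ table[*s] = *s;
+
+ for (s = (const unsigned char *) str; table[*s] != *s; ++s)
+ ;
+
+ return (const char *) s - str;
+ }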
*/ + + pushl %ecx /* make a 256 bytes long block filled with 0 */ + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl $0 /* These immediate values make the label 2 */ + cfi_adjust_cfa_offset (4) + pushl $0 /* to be aligned on a 16 byte boundary to */ + cfi_adjust_cfa_offset (4) + pushl $0 /* get a better performance of the loop. */ + cfi_adjust_cfa_offset (4) + pushl $0 + cfi_adjust_cfa_offset (4) + pushl $0 + cfi_adjust_cfa_offset (4) + pushl $0 + cfi_adjust_cfa_offset (4) + +/* For understanding the following code remember that %ecx == 0 now. + Although all the following instruction only modify %cl we always + have a correct zero-extended 32-bit value in %ecx. */ + +/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want + longer instructions so that the next loop aligns without adding nops. */ + +L(2): movb (%eax), %cl /* get byte from stopset */ + testb %cl, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 1(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? 
*/ + jz L(1) /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 2(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 3(%eax), %cl /* get byte from stopset */ + addl $4, %eax /* increment stopset pointer */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + testb $0xff, %cl /* is NUL char? */ + jnz L(2) /* no => process next dword from stopset */ + +L(1): leal -4(%edx), %eax /* prepare loop */ + + /* We use a neat trick for the following loop. Normally we would + have to test for two termination conditions + 1. a character in the stopset was found + and + 2. the end of the string was found + But as a sign that the character is in the stopset we store its + value in the table. But the value of NUL is NUL so the loop + terminates for NUL in every case. */ + +L(3): addl $4, %eax /* adjust pointer for full loop round */ + + movb (%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L(4) /* yes => return */ + + movb 1(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L(5) /* yes => return */ + + movb 2(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L(6) /* yes => return */ + + movb 3(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + jne L(3) /* yes => return */ + + incl %eax /* adjust pointer */ +L(6): incl %eax +L(5): incl %eax + +L(4): addl $256, %esp /* remove stopset */ + cfi_adjust_cfa_offset (-256) + subl %edx, %eax /* we have to return the number of valid + characters, so compute distance to first + non-valid character */ + ret +END (strcspn) +libc_hidden_builtin_def (strcspn) diff --git a/REORG.TODO/sysdeps/i386/string-inlines.c b/REORG.TODO/sysdeps/i386/string-inlines.c new file mode 100644 index 0000000000..d023bc3aa3 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/string-inlines.c @@ -0,0 +1,47 @@ +/* Copyright (C) 1999-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* This is to avoid PLT entries for the x86 version. 
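+ Schematically the pattern is to compile the generic code under an
+ internal name and export the ABI name as an alias:
+
+ #define __memcpy_g __memcpy_g_internal
+ #include <string/string-inlines.c>
+ strong_alias (__memcpy_g_internal, __memcpy_g)
+
+ so that calls from within this translation unit bind to the local
+ symbol directly instead of going through the PLT.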
*/ +#define __memcpy_g __memcpy_g_internal +#define __strchr_g __strchr_g_internal +#include <string/string-inlines.c> + +void * +(__memcpy_c) (void *d, const void *s, size_t n) +{ + return memcpy (d, s, n); +} + +void * +__memset_cc (void *s, unsigned long int pattern, size_t n) +{ + return memset (s, pattern & 0xff, n); +} +strong_alias (__memset_cc, __memset_cg) + +void * +__memset_gg (void *s, char c, size_t n) +{ + return memset (s, c, n); +} + +#ifdef __memcpy_c +# undef __memcpy_g +strong_alias (__memcpy_g_internal, __memcpy_g) +# undef __strchr_g +strong_alias (__strchr_g_internal, __strchr_g) +#endif diff --git a/REORG.TODO/sysdeps/i386/strlen.S b/REORG.TODO/sysdeps/i386/strlen.S new file mode 100644 index 0000000000..192fadf20a --- /dev/null +++ b/REORG.TODO/sysdeps/i386/strlen.S @@ -0,0 +1,132 @@ +/* strlen(str) -- determine the length of the string STR. + Optimized for Intel 80x86, x>=4. + Copyright (C) 1991-2017 Free Software Foundation, Inc. + Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4 /* no space for saved regs */ +#define STR PARMS + + .text +ENTRY (strlen) + + movl STR(%esp), %ecx + movl %ecx, %eax /* duplicate it */ + + andl $3, %ecx /* mask alignment bits */ + jz L(1) /* aligned => start loop */ + cmpb %ch, (%eax) /* is byte NUL? */ + je L(2) /* yes => return */ + incl %eax /* increment pointer */ + + xorl $3, %ecx /* was alignment = 3? */ + jz L(1) /* yes => now it is aligned and start loop */ + cmpb %ch, (%eax) /* is byte NUL? */ + je L(2) /* yes => return */ + addl $1, %eax /* increment pointer */ + + subl $1, %ecx /* was alignment = 2? */ + jz L(1) /* yes => now it is aligned and start loop */ + cmpb %ch, (%eax) /* is byte NUL? */ + je L(2) /* yes => return */ + +/* Don't change the above `addl $1,%eax' and `subl $1, %ecx' into `incl %eax' + and `decl %ecx' resp. The additional two byte per instruction make the + label 4 to be aligned on a 16 byte boundary with nops. + + The following `sub $15, %eax' is part of this trick, too. Together with + the next instruction (`addl $16, %eax') it is in fact a `incl %eax', just + as expected from the algorithm. But doing so has the advantage that + no jump to label 1 is necessary and so the pipeline is not flushed. */ + + subl $15, %eax /* effectively +1 */ + + +L(4): addl $16, %eax /* adjust pointer for full loop */ + +L(1): movl (%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edx /* magic value */ + addl %ecx, %edx /* add the magic value to the word. 
We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(3) /* highest byte is NUL => return pointer */ + xorl %ecx, %edx /* (word+magic)^word */ + orl $0xfefefeff, %edx /* set all non-carry bits */ + incl %edx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(3) /* found NUL => return pointer */ + + movl 4(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edx /* magic value */ + addl %ecx, %edx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(5) /* highest byte is NUL => return pointer */ + xorl %ecx, %edx /* (word+magic)^word */ + orl $0xfefefeff, %edx /* set all non-carry bits */ + incl %edx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(5) /* found NUL => return pointer */ + + movl 8(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edx /* magic value */ + addl %ecx, %edx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(6) /* highest byte is NUL => return pointer */ + xorl %ecx, %edx /* (word+magic)^word */ + orl $0xfefefeff, %edx /* set all non-carry bits */ + incl %edx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(6) /* found NUL => return pointer */ + + movl 12(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edx /* magic value */ + addl %ecx, %edx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(7) /* highest byte is NUL => return pointer */ + xorl %ecx, %edx /* (word+magic)^word */ + orl $0xfefefeff, %edx /* set all non-carry bits */ + incl %edx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz L(4) /* no NUL found => continue loop */ + +L(7): addl $4, %eax /* adjust pointer */ +L(6): addl $4, %eax +L(5): addl $4, %eax + +L(3): testb %cl, %cl /* is first byte NUL? */ + jz L(2) /* yes => return */ + incl %eax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz L(2) /* yes => return */ + incl %eax /* increment pointer */ + + testl $0xff0000, %ecx /* is third byte NUL? */ + jz L(2) /* yes => return pointer */ + incl %eax /* increment pointer */ + +L(2): subl STR(%esp), %eax /* compute difference to string start */ + + ret +END (strlen) +libc_hidden_builtin_def (strlen) diff --git a/REORG.TODO/sysdeps/i386/strlen.c b/REORG.TODO/sysdeps/i386/strlen.c new file mode 100644 index 0000000000..0b69957392 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/strlen.c @@ -0,0 +1,35 @@ +/* Determine the length of a string. For Intel 80x86, x>=3. + Copyright (C) 1991-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Torbjorn Granlund (tege@sics.se). + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/
+
+#include <string.h>
+
+size_t
+strlen (const char *str)
+{
+ int cnt;
+
+ asm("cld\n" /* Search forward. */
+ /* Some old versions of gas need `repne' instead of `repnz'. */
+ "repnz\n" /* Look for a zero byte. */
+ "scasb" /* %0, %1, %3 */ :
+ "=c" (cnt) : "D" (str), "0" (-1), "a" (0));
+
+ return -2 - cnt;
+}
+libc_hidden_builtin_def (strlen) diff --git a/REORG.TODO/sysdeps/i386/strpbrk.S b/REORG.TODO/sysdeps/i386/strpbrk.S new file mode 100644 index 0000000000..1109b233da --- /dev/null +++ b/REORG.TODO/sysdeps/i386/strpbrk.S @@ -0,0 +1,243 @@ +/* strpbrk (str, ss) -- Return pointer to the first occurrence in STR of
+ any character from SS, or NULL if there is none.
+ For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+ Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define RTN PARMS
+#define STR RTN
+#define STOP STR+4
+
+ .text
+ENTRY (strpbrk)
+
+ movl STR(%esp), %edx
+ movl STOP(%esp), %eax
+
+ /* First we create a table with flags for all possible characters.
+ For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+ supported by the C string functions we have 256 characters.
+ Before inserting marks for the stop characters we clear the whole
+ table. The unrolled form is much faster than a loop. */
+ xorl %ecx, %ecx /* %ecx = 0 !!!
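
+ The table technique is the same as in strcspn; only the
+ return value differs. In portable C the function is
+ roughly equivalent to (an illustrative sketch):
+
+ char *
+ strpbrk (const char *s, const char *stopset)
+ {
+ s += strcspn (s, stopset);
+ return *s != '\0' ? (char *) s : NULL;
+ }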
*/ + + pushl %ecx /* make a 256 bytes long block filled with 0 */ + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl $0 /* These immediate values make the label 2 */ + cfi_adjust_cfa_offset (4) + pushl $0 /* to be aligned on a 16 byte boundary to */ + cfi_adjust_cfa_offset (4) + pushl $0 /* get a better performance of the loop. */ + cfi_adjust_cfa_offset (4) + pushl $0 + cfi_adjust_cfa_offset (4) + pushl $0 + cfi_adjust_cfa_offset (4) + pushl $0 + cfi_adjust_cfa_offset (4) + +/* For understanding the following code remember that %ecx == 0 now. + Although all the following instruction only modify %cl we always + have a correct zero-extended 32-bit value in %ecx. */ + +/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want + longer instructions so that the next loop aligns without adding nops. */ + +L(2): movb (%eax), %cl /* get byte from stopset */ + testb %cl, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 1(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? 
*/ + jz L(1) /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 2(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 3(%eax), %cl /* get byte from stopset */ + addl $4, %eax /* increment stopset pointer */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + testb $0xff, %cl /* is NUL char? */ + jnz L(2) /* no => process next dword from stopset */ + +L(1): leal -4(%edx), %eax /* prepare loop */ + + /* We use a neat trick for the following loop. Normally we would + have to test for two termination conditions + 1. a character in the stopset was found + and + 2. the end of the string was found + But as a sign that the character is in the stopset we store its + value in the table. But the value of NUL is NUL so the loop + terminates for NUL in every case. */ + +L(3): addl $4, %eax /* adjust pointer for full loop round */ + + movb (%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L(4) /* yes => return */ + + movb 1(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L(5) /* yes => return */ + + movb 2(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L(6) /* yes => return */ + + movb 3(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + jne L(3) /* yes => return */ + + incl %eax /* adjust pointer */ +L(6): incl %eax +L(5): incl %eax + +L(4): addl $256, %esp /* remove stopset */ + cfi_adjust_cfa_offset (-256) + + orb %cl, %cl /* was last character NUL? */ + jnz L(7) /* no => return pointer */ + xorl %eax, %eax + +L(7): ret +END (strpbrk) +libc_hidden_builtin_def (strpbrk) diff --git a/REORG.TODO/sysdeps/i386/strrchr.S b/REORG.TODO/sysdeps/i386/strrchr.S new file mode 100644 index 0000000000..95b304dc0b --- /dev/null +++ b/REORG.TODO/sysdeps/i386/strrchr.S @@ -0,0 +1,334 @@ +/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR. + For Intel 80x86, x>=3. + Copyright (C) 1994-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu> + Some optimisations by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au> + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "asm-syntax.h" + +#define PARMS 4+8 /* space for 2 saved regs */ +#define RTN PARMS +#define STR RTN +#define CHR STR+4 + + .text +ENTRY (strrchr) + + pushl %edi /* Save callee-safe registers used here. 
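
+ Unlike strchr the whole string must be scanned: %eax
+ starts out as NULL and is updated with the address of
+ every match, so when the terminating NUL is reached it
+ holds the last occurrence. In portable C the function
+ is roughly (an illustrative sketch):
+
+ char *
+ strrchr (const char *s, int c_in)
+ {
+ char c = (char) c_in;
+ const char *found = NULL;
+ for (;; ++s)
+ {
+ if (*s == c)
+ found = s;
+ if (*s == '\0')
+ return (char *) found;
+ }
+ }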
*/ + cfi_adjust_cfa_offset (4) + cfi_rel_offset (edi, 0) + pushl %esi + cfi_adjust_cfa_offset (4) + + xorl %eax, %eax + movl STR(%esp), %esi + cfi_rel_offset (esi, 0) + movl CHR(%esp), %ecx + + /* At the moment %ecx contains C. What we need for the + algorithm is C in all bytes of the dword. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). */ + movb %cl, %ch /* now it is 0|0|c|c */ + movl %ecx, %edx + shll $16, %ecx /* now it is c|c|0|0 */ + movw %dx, %cx /* and finally c|c|c|c */ + + /* Before we start with the main loop we process single bytes + until the source pointer is aligned. This has two reasons: + 1. aligned 32-bit memory access is faster + and (more important) + 2. we process in the main loop 32 bit in one step although + we don't know the end of the string. But accessing at + 4-byte alignment guarantees that we never access illegal + memory if this would not also be done by the trivial + implementation (this is because all processor inherent + boundaries are multiples of 4. */ + + testl $3, %esi /* correctly aligned ? */ + jz L(19) /* yes => begin loop */ + movb (%esi), %dl /* load byte in question (we need it twice) */ + cmpb %dl, %cl /* compare byte */ + jne L(11) /* target found => return */ + movl %esi, %eax /* remember pointer as possible result */ +L(11): orb %dl, %dl /* is NUL? */ + jz L(2) /* yes => return NULL */ + incl %esi /* increment pointer */ + + testl $3, %esi /* correctly aligned ? */ + jz L(19) /* yes => begin loop */ + movb (%esi), %dl /* load byte in question (we need it twice) */ + cmpb %dl, %cl /* compare byte */ + jne L(12) /* target found => return */ + movl %esi, %eax /* remember pointer as result */ +L(12): orb %dl, %dl /* is NUL? */ + jz L(2) /* yes => return NULL */ + incl %esi /* increment pointer */ + + testl $3, %esi /* correctly aligned ? */ + jz L(19) /* yes => begin loop */ + movb (%esi), %dl /* load byte in question (we need it twice) */ + cmpb %dl, %cl /* compare byte */ + jne L(13) /* target found => return */ + movl %esi, %eax /* remember pointer as result */ +L(13): orb %dl, %dl /* is NUL? */ + jz L(2) /* yes => return NULL */ + incl %esi /* increment pointer */ + + /* No we have reached alignment. */ + jmp L(19) /* begin loop */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. */ + + /* Each round the main loop processes 16 bytes. */ + + /* Jump to here when the character is detected. 
We chose this + way around because the character one is looking for is not + as frequent as the rest and taking a conditional jump is more + expensive than ignoring it. + + Some more words to the code below: it might not be obvious why + we decrement the source pointer here. In the loop the pointer + is not pre-incremented and so it still points before the word + we are looking at. But you should take a look at the instruction + which gets executed before we get into the loop: `addl $16, %esi'. + This makes the following subs into adds. */ + + /* These fill bytes make the main loop be correctly aligned. + We cannot use align because it is not the following instruction + which should be aligned. */ + .byte 0, 0 +#ifndef PROF + /* Profiling adds some code and so changes the alignment. */ + .byte 0 +#endif + +L(4): subl $4, %esi /* adjust pointer */ +L(41): subl $4, %esi +L(42): subl $4, %esi +L(43): testl $0xff000000, %edx /* is highest byte == C? */ + jnz L(33) /* no => try other bytes */ + leal 15(%esi), %eax /* store address as result */ + jmp L(1) /* and start loop again */ + +L(3): subl $4, %esi /* adjust pointer */ +L(31): subl $4, %esi +L(32): subl $4, %esi +L(33): testl $0xff0000, %edx /* is C in third byte? */ + jnz L(51) /* no => try other bytes */ + leal 14(%esi), %eax /* store address as result */ + jmp L(1) /* and start loop again */ + +L(51): + /* At this point we know that the byte is in one of the lower bytes. + We make a guess and correct it if necessary. This reduces the + number of necessary jumps. */ + leal 12(%esi), %eax /* guess address of lowest byte as result */ + testb %dh, %dh /* is guess correct? */ + jnz L(1) /* yes => start loop */ + leal 13(%esi), %eax /* correct guess to second byte */ + +L(1): addl $16, %esi /* increment pointer for full round */ + +L(19): movl (%esi), %edx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + + /* According to the algorithm we had to reverse the effect of the + XOR first and then test the overflow bits. But because the + following XOR would destroy the carry flag and it would (in a + representation with more than 32 bits) not alter then last + overflow, we can now test this condition. If no carry is signaled + no overflow must have occurred in the last byte => it was 0. */ + + jnc L(20) /* found NUL => check last word */ + + /* We are only interested in carry bits that change due to the + previous add, so remove original bits */ + xorl %edx, %edi /* (word+magic)^word */ + + /* Now test for the other three overflow bits. */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + /* If at least one byte of the word is C we don't get 0 in %edi. */ + jnz L(20) /* found NUL => check last word */ + + /* Now we made sure the dword does not contain the character we are + looking for. But because we deal with strings we have to check + for the end of string before testing the next dword. */ + + xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. 
We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(4) /* highest byte is C => examine dword */ + xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(3) /* C is detected in the word => examine it */ + + movl 4(%esi), %edx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(21) /* found NUL => check last word */ + xorl %edx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(21) /* found NUL => check last word */ + xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(41) /* highest byte is C => examine dword */ + xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(31) /* C is detected in the word => examine it */ + + movl 8(%esi), %edx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(22) /* found NUL => check last word */ + xorl %edx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(22) /* found NUL => check last word */ + xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(42) /* highest byte is C => examine dword */ + xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(32) /* C is detected in the word => examine it */ + + movl 12(%esi), %edx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(23) /* found NUL => check last word */ + xorl %edx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L(23) /* found NUL => check last word */ + xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L(43) /* highest byte is C => examine dword */ + xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. 
*/
+ jz L(1) /* C is not detected => restart loop */
+ jmp L(33) /* examine word */
+
+L(23): addl $4, %esi /* adjust pointer */
+L(22): addl $4, %esi
+L(21): addl $4, %esi
+
+ /* What remains to do is to test which byte the NUL char is and
+ whether the searched character appears in one of the bytes
+ before. A special case is that the searched byte may be NUL.
+ In this case a pointer to the terminating NUL char has to be
+ returned. */
+
+L(20): cmpb %cl, %dl /* is first byte == C? */
+ jne L(24) /* no => skip */
+ movl %esi, %eax /* store address as result */
+L(24): testb %dl, %dl /* is first byte == NUL? */
+ jz L(2) /* yes => return */
+
+ cmpb %cl, %dh /* is second byte == C? */
+ jne L(25) /* no => skip */
+ leal 1(%esi), %eax /* store address as result */
+L(25): testb %dh, %dh /* is second byte == NUL? */
+ jz L(2) /* yes => return */
+
+ shrl $16,%edx /* make upper bytes accessible */
+ cmpb %cl, %dl /* is third byte == C? */
+ jne L(26) /* no => skip */
+ leal 2(%esi), %eax /* store address as result */
+L(26): testb %dl, %dl /* is third byte == NUL? */
+ jz L(2) /* yes => return */
+
+ cmpb %cl, %dh /* is fourth byte == C? */
+ jne L(2) /* no => skip */
+ leal 3(%esi), %eax /* store address as result */
+
+L(2): popl %esi /* restore saved register content */
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (esi)
+ popl %edi
+ cfi_adjust_cfa_offset (-4)
+ cfi_restore (edi)
+
+ ret
+END (strrchr)
+
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr) diff --git a/REORG.TODO/sysdeps/i386/strspn.S b/REORG.TODO/sysdeps/i386/strspn.S new file mode 100644 index 0000000000..d433eb6af5 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/strspn.S @@ -0,0 +1,240 @@ +/* strspn (str, ss) -- Return the length of the initial segment of STR
+ which contains only characters from SS.
+ For Intel 80x86, x>=3.
+ Copyright (C) 1994-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>
+ Bug fixes by Alan Modra <Alan@SPRI.Levels.UniSA.Edu.Au>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#define PARMS 4 /* no space for saved regs */
+#define STR PARMS
+#define SKIP STR+4
+
+ .text
+ENTRY (strspn)
+
+ movl STR(%esp), %edx
+ movl SKIP(%esp), %eax
+
+ /* First we create a table with flags for all possible characters.
+ For the ASCII (7bit/8bit) or ISO-8859-X character sets which are
+ supported by the C string functions we have 256 characters.
+ Before inserting marks for the stop characters we clear the whole
+ table. The unrolled form is much faster than a loop. */
+ xorl %ecx, %ecx /* %ecx = 0 !!!
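
+ This is strcspn with the sense of the table test inverted:
+ the scan continues while the table entry for the current
+ character is set, and stops at the terminating NUL because
+ table[0] == 0. In C, roughly (an illustrative sketch):
+
+ size_t
+ strspn (const char *str, const char *skipset)
+ {
+ unsigned char table[256] = { 0 };
+ const unsigned char *s;
+
+ for (s = (const unsigned char *) skipset; *s != '\0'; ++s)
+ table[*s] = *s;
+
+ for (s = (const unsigned char *) str; table[*s] != 0; ++s)
+ ;
+
+ return (const char *) s - str;
+ }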
*/ + + pushl %ecx /* make a 256 bytes long block filled with 0 */ + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl %ecx + cfi_adjust_cfa_offset (4) + pushl $0 /* These immediate values make the label 2 */ + cfi_adjust_cfa_offset (4) + pushl $0 /* to be aligned on a 16 byte boundary to */ + cfi_adjust_cfa_offset (4) + pushl $0 /* get a better performance of the loop. */ + cfi_adjust_cfa_offset (4) + pushl $0 + cfi_adjust_cfa_offset (4) + pushl $0 + cfi_adjust_cfa_offset (4) + pushl $0 + cfi_adjust_cfa_offset (4) + +/* For understanding the following code remember that %ecx == 0 now. + Although all the following instruction only modify %cl we always + have a correct zero-extended 32-bit value in %ecx. */ + +/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want + longer instructions so that the next loop aligns without adding nops. */ + +L(2): movb (%eax), %cl /* get byte from stopset */ + testb %cl, %cl /* is NUL char? */ + jz L(1) /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 1(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? 
*/
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
+
+ movb 2(%eax), %cl /* get byte from stopset */
+ testb $0xff, %cl /* is NUL char? */
+ jz L(1) /* yes => start compare loop */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
+
+ movb 3(%eax), %cl /* get byte from stopset */
+ addl $4, %eax /* increment stopset pointer */
+ movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */
+ testb $0xff, %cl /* is NUL char? */
+ jnz L(2) /* no => process next dword from stopset */
+
+L(1): leal -4(%edx), %eax /* prepare loop */
+
+ /* We use a neat trick for the following loop. Normally we would
+ have to test for two termination conditions
+ 1. a character in the stopset was found
+ and
+ 2. the end of the string was found
+ As a sign that the character is in the stopset we store its
+ value in the table. Since the value stored for NUL is NUL
+ itself, the loop terminates for NUL in every case. */
+
+L(3): addl $4, %eax /* adjust pointer for full loop round */
+
+ movb (%eax), %cl /* get byte from string */
+ testb %cl, (%esp,%ecx) /* is it contained in skipset? */
+ jz L(4) /* no => return */
+
+ movb 1(%eax), %cl /* get byte from string */
+ testb %cl, (%esp,%ecx) /* is it contained in skipset? */
+ jz L(5) /* no => return */
+
+ movb 2(%eax), %cl /* get byte from string */
+ testb %cl, (%esp,%ecx) /* is it contained in skipset? */
+ jz L(6) /* no => return */
+
+ movb 3(%eax), %cl /* get byte from string */
+ testb %cl, (%esp,%ecx) /* is it contained in skipset? */
+ jnz L(3) /* yes => start loop again */
+
+ incl %eax /* adjust pointer */
+L(6): incl %eax
+L(5): incl %eax
+
+L(4): addl $256, %esp /* remove stopset */
+ cfi_adjust_cfa_offset (-256)
+ subl %edx, %eax /* we have to return the number of valid
+ characters, so compute distance to first
+ non-valid character */
+ ret
+END (strspn)
+libc_hidden_builtin_def (strspn)
diff --git a/REORG.TODO/sysdeps/i386/sub_n.S b/REORG.TODO/sysdeps/i386/sub_n.S
new file mode 100644
index 0000000000..3649da29e7
--- /dev/null
+++ b/REORG.TODO/sysdeps/i386/sub_n.S
@@ -0,0 +1,111 @@
+/* i80386 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and
+ store difference in a third limb vector.
+ Copyright (C) 1992-2017 Free Software Foundation, Inc.
+ This file is part of the GNU MP Library.
+
+ The GNU MP Library is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as published by
+ the Free Software Foundation; either version 2.1 of the License, or (at your
+ option) any later version.
+
+ The GNU MP Library is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+ License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with the GNU MP Library; see the file COPYING.LIB. If not,
+ see <http://www.gnu.org/licenses/>.
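The strspn scan loop above merges its two exit conditions into a single test. A hedged C rendering (illustrative names, GNU style):

#include <stddef.h>

// Scan step, in C terms: each accepted byte's table entry equals the
// byte itself and entry 0 is zero, so one nonzero test covers both
// "byte not in the set" and "end of string".
static size_t
scan (const unsigned char table[256], const char *str)
{
  const char *p = str;
  while (table[(unsigned char) *p] != 0)
    p++;
  return (size_t) (p - str);    // length of the accepted prefix
}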
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+8 /* space for 2 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define S2 S1+4 +#define SIZE S2+4 + + .text +ENTRY (__mpn_sub_n) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + + movl RES(%esp),%edi + cfi_rel_offset (edi, 4) + movl S1(%esp),%esi + cfi_rel_offset (esi, 0) + movl S2(%esp),%edx + movl SIZE(%esp),%ecx + movl %ecx,%eax + shrl $3,%ecx /* compute count for unrolled loop */ + negl %eax + andl $7,%eax /* get index where to start loop */ + jz L(oop) /* necessary special case for 0 */ + incl %ecx /* adjust loop count */ + shll $2,%eax /* adjustment for pointers... */ + subl %eax,%edi /* ... since they are offset ... */ + subl %eax,%esi /* ... by a constant when we ... */ + subl %eax,%edx /* ... enter the loop */ + shrl $2,%eax /* restore previous value */ +#ifdef PIC +/* Calculate start address in loop for PIC. Due to limitations in some + assemblers, Loop-L0-3 cannot be put into the leal */ + call L(0) + cfi_adjust_cfa_offset (4) +L(0): leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $(L(oop)-L(0)-3),%eax + addl $4,%esp + cfi_adjust_cfa_offset (-4) +#else +/* Calculate start address in loop for non-PIC. */ + leal (L(oop) - 3)(%eax,%eax,8),%eax +#endif + jmp *%eax /* jump into loop */ + ALIGN (3) +L(oop): movl (%esi),%eax + sbbl (%edx),%eax + movl %eax,(%edi) + movl 4(%esi),%eax + sbbl 4(%edx),%eax + movl %eax,4(%edi) + movl 8(%esi),%eax + sbbl 8(%edx),%eax + movl %eax,8(%edi) + movl 12(%esi),%eax + sbbl 12(%edx),%eax + movl %eax,12(%edi) + movl 16(%esi),%eax + sbbl 16(%edx),%eax + movl %eax,16(%edi) + movl 20(%esi),%eax + sbbl 20(%edx),%eax + movl %eax,20(%edi) + movl 24(%esi),%eax + sbbl 24(%edx),%eax + movl %eax,24(%edi) + movl 28(%esi),%eax + sbbl 28(%edx),%eax + movl %eax,28(%edi) + leal 32(%edi),%edi + leal 32(%esi),%esi + leal 32(%edx),%edx + decl %ecx + jnz L(oop) + + sbbl %eax,%eax + negl %eax + + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_sub_n) diff --git a/REORG.TODO/sysdeps/i386/submul_1.S b/REORG.TODO/sysdeps/i386/submul_1.S new file mode 100644 index 0000000000..c765e8dd79 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/submul_1.S @@ -0,0 +1,86 @@ +/* i80386 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract + the result from a second limb vector. + Copyright (C) 1992-2017 Free Software Foundation, Inc. + This file is part of the GNU MP Library. + + The GNU MP Library is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation; either version 2.1 of the License, or (at your + option) any later version. + + The GNU MP Library is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public + License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with the GNU MP Library; see the file COPYING.LIB. If not, + see <http://www.gnu.org/licenses/>. 
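The sub_n loop above is unrolled eight ways and entered at a computed address, Duff's-device style, so arbitrary sizes need no separate tail loop. Its arithmetic reduces to the following hedged C sketch (32-bit limbs assumed; names illustrative):

typedef unsigned int mp_limb;   // assumes 32-bit limbs, as on i386

// Limbwise subtraction with borrow propagation; the return value is
// the final borrow (0 or 1), which the asm produces with sbbl/negl.
static mp_limb
sub_n_sketch (mp_limb *res, const mp_limb *s1, const mp_limb *s2, int n)
{
  mp_limb borrow = 0;
  for (int i = 0; i < n; i++)
    {
      mp_limb a = s1[i], b = s2[i];
      res[i] = a - b - borrow;
      // Borrow out exactly when a < b + borrow, computed carefully
      // so that b == 0xffffffff with borrow == 1 still works.
      borrow = (a < b) || (a == b && borrow);
    }
  return borrow;
}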
*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define PARMS 4+16 /* space for 4 saved regs */ +#define RES PARMS +#define S1 RES+4 +#define SIZE S1+4 +#define S2LIMB SIZE+4 + +#define res_ptr edi +#define s1_ptr esi +#define sizeP ecx +#define s2_limb ebx + + .text +ENTRY (__mpn_submul_1) + + pushl %edi + cfi_adjust_cfa_offset (4) + pushl %esi + cfi_adjust_cfa_offset (4) + pushl %ebp + cfi_adjust_cfa_offset (4) + pushl %ebx + cfi_adjust_cfa_offset (4) + cfi_rel_offset (edi, 12) + cfi_rel_offset (esi, 8) + cfi_rel_offset (ebp, 4) + cfi_rel_offset (ebx, 0) + + movl RES(%esp), %res_ptr + movl S1(%esp), %s1_ptr + movl SIZE(%esp), %sizeP + movl S2LIMB(%esp), %s2_limb + leal (%res_ptr,%sizeP,4), %res_ptr + leal (%s1_ptr,%sizeP,4), %s1_ptr + negl %sizeP + xorl %ebp, %ebp + ALIGN (3) +L(oop): + movl (%s1_ptr,%sizeP,4), %eax + mull %s2_limb + addl %ebp, %eax + adcl $0, %edx + subl %eax, (%res_ptr,%sizeP,4) + adcl $0, %edx + movl %edx, %ebp + + incl %sizeP + jnz L(oop) + movl %ebp, %eax + + popl %ebx + cfi_adjust_cfa_offset (-4) + cfi_restore (ebx) + popl %ebp + cfi_adjust_cfa_offset (-4) + cfi_restore (ebp) + popl %esi + cfi_adjust_cfa_offset (-4) + cfi_restore (esi) + popl %edi + cfi_adjust_cfa_offset (-4) + cfi_restore (edi) + + ret +END (__mpn_submul_1) diff --git a/REORG.TODO/sysdeps/i386/symbol-hacks.h b/REORG.TODO/sysdeps/i386/symbol-hacks.h new file mode 100644 index 0000000000..36a13c83f7 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/symbol-hacks.h @@ -0,0 +1,21 @@ +/* Hacks needed for symbol manipulation. i386 version. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdeps/wordsize-32/divdi3-symbol-hacks.h> + +#include_next "symbol-hacks.h" diff --git a/REORG.TODO/sysdeps/i386/sys/ucontext.h b/REORG.TODO/sysdeps/i386/sys/ucontext.h new file mode 100644 index 0000000000..fb5df11965 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/sys/ucontext.h @@ -0,0 +1,139 @@ +/* Copyright (C) 1997-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* System V/i386 ABI compliant context switching support. 
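The submul_1 loop above pairs mull with two adcl fixups per limb; with a 64-bit intermediate the same computation looks like this hedged C sketch (32-bit limbs assumed; names illustrative):

typedef unsigned int mp_limb;   // assumes 32-bit limbs, as on i386

// res[i] -= s1[i] * s2_limb across the vector; the function returns
// the most significant limb left over from the product/borrow chain.
static mp_limb
submul_1_sketch (mp_limb *res, const mp_limb *s1, int n, mp_limb s2_limb)
{
  mp_limb carry = 0;
  for (int i = 0; i < n; i++)
    {
      unsigned long long prod = (unsigned long long) s1[i] * s2_limb + carry;
      mp_limb low = (mp_limb) prod;
      mp_limb high = (mp_limb) (prod >> 32);
      if (res[i] < low)     // subtracting low borrows into the next limb
        high++;
      res[i] -= low;
      carry = high;
    }
  return carry;
}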
*/ + +#ifndef _SYS_UCONTEXT_H +#define _SYS_UCONTEXT_H 1 + +#include <features.h> + +#include <bits/types/sigset_t.h> +#include <bits/sigcontext.h> +#include <bits/types/stack_t.h> + + +/* Type for general register. */ +typedef int greg_t; + +/* Number of general registers. */ +#define __NGREG 19 +#ifdef __USE_MISC +# define NGREG __NGREG +#endif + +/* Container for all general registers. */ +typedef greg_t gregset_t[__NGREG]; + +#ifdef __USE_MISC +/* Number of each register is the `gregset_t' array. */ +enum +{ + REG_GS = 0, +# define REG_GS REG_GS + REG_FS, +# define REG_FS REG_FS + REG_ES, +# define REG_ES REG_ES + REG_DS, +# define REG_DS REG_DS + REG_EDI, +# define REG_EDI REG_EDI + REG_ESI, +# define REG_ESI REG_ESI + REG_EBP, +# define REG_EBP REG_EBP + REG_ESP, +# define REG_ESP REG_ESP + REG_EBX, +# define REG_EBX REG_EBX + REG_EDX, +# define REG_EDX REG_EDX + REG_ECX, +# define REG_ECX REG_ECX + REG_EAX, +# define REG_EAX REG_EAX + REG_TRAPNO, +# define REG_TRAPNO REG_TRAPNO + REG_ERR, +# define REG_ERR REG_ERR + REG_EIP, +# define REG_EIP REG_EIP + REG_CS, +# define REG_CS REG_CS + REG_EFL, +# define REG_EFL REG_EFL + REG_UESP, +# define REG_UESP REG_UESP + REG_SS +# define REG_SS REG_SS +}; +#endif + +#ifdef __USE_MISC +# define __ctx(fld) fld +# define __ctxt(tag) tag +#else +# define __ctx(fld) __ ## fld +# define __ctxt(tag) /* Empty. */ +#endif + +/* Structure to describe FPU registers. */ +typedef struct fpregset + { + union + { + struct __ctxt(fpchip_state) + { + int __ctx(state)[27]; + int __ctx(status); + } __ctx(fpchip_state); + + struct __ctxt(fp_emul_space) + { + char __ctx(fp_emul)[246]; + char __ctx(fp_epad)[2]; + } __ctx(fp_emul_space); + + int __ctx(f_fpregs)[62]; + } __ctx(fp_reg_set); + + long int __ctx(f_wregs)[33]; + } fpregset_t; + +/* Context to describe whole processor state. */ +typedef struct + { + gregset_t __ctx(gregs); + fpregset_t __ctx(fpregs); + } mcontext_t; + +#undef __ctx +#undef __ctxt + +/* Userlevel context. */ +typedef struct ucontext + { + unsigned long int uc_flags; + struct ucontext *uc_link; + sigset_t uc_sigmask; + stack_t uc_stack; + mcontext_t uc_mcontext; + long int uc_filler[5]; + } ucontext_t; + +#endif /* sys/ucontext.h */ diff --git a/REORG.TODO/sysdeps/i386/sysdep.h b/REORG.TODO/sysdeps/i386/sysdep.h new file mode 100644 index 0000000000..d2b0860b99 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/sysdep.h @@ -0,0 +1,159 @@ +/* Assembler macros for i386. + Copyright (C) 1991-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdeps/generic/sysdep.h> + +#include <features.h> /* For __GNUC_PREREQ. */ + +/* It is desirable that the names of PIC thunks match those used by + GCC so that multiple copies are eliminated by the linker. 
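As a hedged usage sketch for the layout just defined (assuming the Linux SA_SIGINFO convention, where the third handler argument points to a ucontext_t), the saved registers can be read back through the gregs array:

#define _GNU_SOURCE             // exposes the REG_* indices (__USE_MISC)
#include <signal.h>
#include <stdio.h>
#include <sys/ucontext.h>

// Illustration only: printf is not async-signal-safe, so a real
// handler would stash the value somewhere instead of printing it.
static void
handler (int sig, siginfo_t *si, void *ctx)
{
  const ucontext_t *uc = ctx;
  printf ("signal %d at eip=%#lx\n", sig,
          (unsigned long) uc->uc_mcontext.gregs[REG_EIP]);
}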
Because + GCC 4.6 and earlier use __i686 in the names, it is necessary to + override that predefined macro. */ +#if defined __i686 && defined __ASSEMBLER__ +#undef __i686 +#define __i686 __i686 +#endif + +#ifdef __ASSEMBLER__ +# define GET_PC_THUNK(reg) __x86.get_pc_thunk.reg +#else +# define GET_PC_THUNK_STR(reg) "__x86.get_pc_thunk." #reg +#endif + +#ifdef __ASSEMBLER__ + +/* Syntactic details of assembler. */ + +/* ELF uses byte-counts for .align, most others use log2 of count of bytes. */ +#define ALIGNARG(log2) 1<<log2 +#define ASM_SIZE_DIRECTIVE(name) .size name,.-name; + + +/* Define an entry point visible from C. + + There is currently a bug in gdb which prevents us from specifying + incomplete stabs information. Fake some entries here which specify + the current source file. */ +#define ENTRY(name) \ + .globl C_SYMBOL_NAME(name); \ + .type C_SYMBOL_NAME(name),@function; \ + .align ALIGNARG(4); \ + C_LABEL(name) \ + cfi_startproc; \ + CALL_MCOUNT + +#undef END +#define END(name) \ + cfi_endproc; \ + ASM_SIZE_DIRECTIVE(name) + +#define ENTRY_CHK(name) ENTRY (name) +#define END_CHK(name) END (name) + +/* If compiled for profiling, call `mcount' at the start of each function. */ +#ifdef PROF +/* The mcount code relies on a normal frame pointer being on the stack + to locate our caller, so push one just for its benefit. */ +#define CALL_MCOUNT \ + pushl %ebp; cfi_adjust_cfa_offset (4); movl %esp, %ebp; \ + cfi_def_cfa_register (ebp); call JUMPTARGET(mcount); \ + popl %ebp; cfi_def_cfa (esp, 4); +#else +#define CALL_MCOUNT /* Do nothing. */ +#endif + +/* Since C identifiers are not normally prefixed with an underscore + on this system, the asm identifier `syscall_error' intrudes on the + C name space. Make sure we use an innocuous name. */ +#define syscall_error __syscall_error +#define mcount _mcount + +#define PSEUDO(name, syscall_name, args) \ + .globl syscall_error; \ +lose: SYSCALL_PIC_SETUP \ + jmp JUMPTARGET(syscall_error); \ + ENTRY (name) \ + DO_CALL (syscall_name, args); \ + jb lose + +#undef PSEUDO_END +#define PSEUDO_END(name) \ + END (name) + +# define SETUP_PIC_REG(reg) \ + .ifndef GET_PC_THUNK(reg); \ + .section .gnu.linkonce.t.GET_PC_THUNK(reg),"ax",@progbits; \ + .globl GET_PC_THUNK(reg); \ + .hidden GET_PC_THUNK(reg); \ + .p2align 4; \ + .type GET_PC_THUNK(reg),@function; \ +GET_PC_THUNK(reg): \ + movl (%esp), %e##reg; \ + ret; \ + .size GET_PC_THUNK(reg), . - GET_PC_THUNK(reg); \ + .previous; \ + .endif; \ + call GET_PC_THUNK(reg) + +# define LOAD_PIC_REG(reg) \ + SETUP_PIC_REG(reg); addl $_GLOBAL_OFFSET_TABLE_, %e##reg + +#undef JUMPTARGET +#ifdef PIC +#define JUMPTARGET(name) name##@PLT +#define SYSCALL_PIC_SETUP \ + pushl %ebx; \ + cfi_adjust_cfa_offset (4); \ + call 0f; \ +0: popl %ebx; \ + cfi_adjust_cfa_offset (-4); \ + addl $_GLOBAL_OFFSET_TABLE_+[.-0b], %ebx; + +#else +#define JUMPTARGET(name) name +#define SYSCALL_PIC_SETUP /* Nothing. */ +#endif + +/* Local label name for asm code. */ +#ifndef L +#define L(name) .L##name +#endif + +#define atom_text_section .section ".text.atom", "ax" + +#else /* __ASSEMBLER__ */ + +# define SETUP_PIC_REG_STR(reg) \ + ".ifndef " GET_PC_THUNK_STR (reg) "\n" \ + ".section .gnu.linkonce.t." GET_PC_THUNK_STR (reg) ",\"ax\",@progbits\n" \ + ".globl " GET_PC_THUNK_STR (reg) "\n" \ + ".hidden " GET_PC_THUNK_STR (reg) "\n" \ + ".p2align 4\n" \ + ".type " GET_PC_THUNK_STR (reg) ",@function\n" \ +GET_PC_THUNK_STR (reg) ":" \ + "movl (%%esp), %%e" #reg "\n" \ + "ret\n" \ + ".size " GET_PC_THUNK_STR (reg) ", . 
- " GET_PC_THUNK_STR (reg) "\n" \ + ".previous\n" \ + ".endif\n" \ + "call " GET_PC_THUNK_STR (reg) + +# define LOAD_PIC_REG_STR(reg) \ + SETUP_PIC_REG_STR (reg) "\naddl $_GLOBAL_OFFSET_TABLE_, %%e" #reg + +#endif /* __ASSEMBLER__ */ diff --git a/REORG.TODO/sysdeps/i386/tls-macros.h b/REORG.TODO/sysdeps/i386/tls-macros.h new file mode 100644 index 0000000000..053cba05d1 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/tls-macros.h @@ -0,0 +1,78 @@ +#include <features.h> /* For __GNUC_PREREQ. */ + +#define TLS_LE(x) \ + ({ int *__l; \ + asm ("movl %%gs:0,%0\n\t" \ + "subl $" #x "@tpoff,%0" \ + : "=r" (__l)); \ + __l; }) + +#if defined PIC && !__GNUC_PREREQ (5,0) +# define TLS_IE(x) \ + ({ int *__l; \ + asm ("movl %%gs:0,%0\n\t" \ + "subl " #x "@gottpoff(%%ebx),%0" \ + : "=r" (__l)); \ + __l; }) +#else +# define TLS_IE(x) \ + ({ int *__l, __b; \ + asm ("call 1f\n\t" \ + ".subsection 1\n" \ + "1:\tmovl (%%esp), %%ebx\n\t" \ + "ret\n\t" \ + ".previous\n\t" \ + "addl $_GLOBAL_OFFSET_TABLE_, %%ebx\n\t" \ + "movl %%gs:0,%0\n\t" \ + "subl " #x "@gottpoff(%%ebx),%0" \ + : "=r" (__l), "=&b" (__b)); \ + __l; }) +#endif + +#if defined PIC && !__GNUC_PREREQ (5,0) +# define TLS_LD(x) \ + ({ int *__l, __c, __d; \ + asm ("leal " #x "@tlsldm(%%ebx),%%eax\n\t" \ + "call ___tls_get_addr@plt\n\t" \ + "leal " #x "@dtpoff(%%eax), %%eax" \ + : "=a" (__l), "=&c" (__c), "=&d" (__d)); \ + __l; }) +#else +# define TLS_LD(x) \ + ({ int *__l, __b, __c, __d; \ + asm ("call 1f\n\t" \ + ".subsection 1\n" \ + "1:\tmovl (%%esp), %%ebx\n\t" \ + "ret\n\t" \ + ".previous\n\t" \ + "addl $_GLOBAL_OFFSET_TABLE_, %%ebx\n\t" \ + "leal " #x "@tlsldm(%%ebx),%%eax\n\t" \ + "call ___tls_get_addr@plt\n\t" \ + "leal " #x "@dtpoff(%%eax), %%eax" \ + : "=a" (__l), "=&b" (__b), "=&c" (__c), "=&d" (__d)); \ + __l; }) +#endif + +#if defined PIC && !__GNUC_PREREQ (5,0) +# define TLS_GD(x) \ + ({ int *__l, __c, __d; \ + asm ("leal " #x "@tlsgd(%%ebx),%%eax\n\t" \ + "call ___tls_get_addr@plt\n\t" \ + "nop" \ + : "=a" (__l), "=&c" (__c), "=&d" (__d)); \ + __l; }) +#else +# define TLS_GD(x) \ + ({ int *__l, __b, __c, __d; \ + asm ("call 1f\n\t" \ + ".subsection 1\n" \ + "1:\tmovl (%%esp), %%ebx\n\t" \ + "ret\n\t" \ + ".previous\n\t" \ + "addl $_GLOBAL_OFFSET_TABLE_, %%ebx\n\t" \ + "leal " #x "@tlsgd(%%ebx),%%eax\n\t" \ + "call ___tls_get_addr@plt\n\t" \ + "nop" \ + : "=a" (__l), "=&b" (__b), "=&c" (__c), "=&d" (__d)); \ + __l; }) +#endif diff --git a/REORG.TODO/sysdeps/i386/tlsdesc.c b/REORG.TODO/sysdeps/i386/tlsdesc.c new file mode 100644 index 0000000000..90de2bb05e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/tlsdesc.c @@ -0,0 +1,268 @@ +/* Manage TLS descriptors. i386 version. + Copyright (C) 2005-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <link.h> +#include <ldsodefs.h> +#include <elf/dynamic-link.h> +#include <tls.h> +#include <dl-tlsdesc.h> +#include <dl-unmap-segments.h> +#include <tlsdeschtab.h> + +/* The following 4 functions take an entry_check_offset argument. + It's computed by the caller as an offset between its entry point + and the call site, such that by adding the built-in return address + that is implicitly passed to the function with this offset, we can + easily obtain the caller's entry point to compare with the entry + point given in the TLS descriptor. If it's changed, we want to + return immediately. */ + +/* This function is used to lazily resolve TLS_DESC REL relocations + that reference the *ABS* segment in their own link maps. The + argument is the addend originally stored there. */ + +void +__attribute__ ((regparm (3))) attribute_hidden +_dl_tlsdesc_resolve_abs_plus_addend_fixup (struct tlsdesc volatile *td, + struct link_map *l, + ptrdiff_t entry_check_offset) +{ + ptrdiff_t addend = (ptrdiff_t) td->arg; + + if (_dl_tlsdesc_resolve_early_return_p (td, __builtin_return_address (0) + - entry_check_offset)) + return; + +#ifndef SHARED + CHECK_STATIC_TLS (l, l); +#else + if (!TRY_STATIC_TLS (l, l)) + { + td->arg = _dl_make_tlsdesc_dynamic (l, addend); + td->entry = _dl_tlsdesc_dynamic; + } + else +#endif + { + td->arg = (void*) (addend - l->l_tls_offset); + td->entry = _dl_tlsdesc_return; + } + + _dl_tlsdesc_wake_up_held_fixups (); +} + +/* This function is used to lazily resolve TLS_DESC REL relocations + that originally had zero addends. The argument location, that + originally held the addend, is used to hold a pointer to the + relocation, but it has to be restored before we call the function + that applies relocations. */ + +void +__attribute__ ((regparm (3))) attribute_hidden +_dl_tlsdesc_resolve_rel_fixup (struct tlsdesc volatile *td, + struct link_map *l, + ptrdiff_t entry_check_offset) +{ + const ElfW(Rel) *reloc = td->arg; + + if (_dl_tlsdesc_resolve_early_return_p (td, __builtin_return_address (0) + - entry_check_offset)) + return; + + /* The code below was borrowed from _dl_fixup(), + except for checking for STB_LOCAL. */ + const ElfW(Sym) *const symtab + = (const void *) D_PTR (l, l_info[DT_SYMTAB]); + const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]); + const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)]; + lookup_t result; + + /* Look up the target symbol. If the normal lookup rules are not + used don't look in the global scope. */ + if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL + && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0) + { + const struct r_found_version *version = NULL; + + if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL) + { + const ElfW(Half) *vernum = + (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]); + ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff; + version = &l->l_versions[ndx]; + if (version->hash == 0) + version = NULL; + } + + result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym, + l->l_scope, version, ELF_RTYPE_CLASS_PLT, + DL_LOOKUP_ADD_DEPENDENCY, NULL); + } + else + { + /* We already found the symbol. The module (and therefore its load + address) is also known. 
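A simplified sketch (not the real helper) of the entry-point check described above: the caller's entry point is recovered by subtracting the passed-in offset from the implicit return address, and a mismatch with the descriptor's current entry means another thread won the race. The struct layout here is a hypothetical stand-in:

#include <stddef.h>

struct tlsdesc_sketch { void *entry; void *arg; };

// Returns nonzero when the descriptor no longer points at the caller,
// i.e. when the relocation was already resolved and we can back out.
static int
already_resolved (struct tlsdesc_sketch volatile *td, void *ret_addr,
                  ptrdiff_t entry_check_offset)
{
  void *caller_entry = (char *) ret_addr - entry_check_offset;
  return td->entry != caller_entry;
}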
*/ + result = l; + } + + if (!sym) + { + td->arg = 0; + td->entry = _dl_tlsdesc_undefweak; + } + else + { +# ifndef SHARED + CHECK_STATIC_TLS (l, result); +# else + if (!TRY_STATIC_TLS (l, result)) + { + td->arg = _dl_make_tlsdesc_dynamic (result, sym->st_value); + td->entry = _dl_tlsdesc_dynamic; + } + else +# endif + { + td->arg = (void*)(sym->st_value - result->l_tls_offset); + td->entry = _dl_tlsdesc_return; + } + } + + _dl_tlsdesc_wake_up_held_fixups (); +} + +/* This function is used to lazily resolve TLS_DESC RELA relocations. + The argument location is used to hold a pointer to the relocation. */ + +void +__attribute__ ((regparm (3))) attribute_hidden +_dl_tlsdesc_resolve_rela_fixup (struct tlsdesc volatile *td, + struct link_map *l, + ptrdiff_t entry_check_offset) +{ + const ElfW(Rela) *reloc = td->arg; + + if (_dl_tlsdesc_resolve_early_return_p (td, __builtin_return_address (0) + - entry_check_offset)) + return; + + /* The code below was borrowed from _dl_fixup(), + except for checking for STB_LOCAL. */ + const ElfW(Sym) *const symtab + = (const void *) D_PTR (l, l_info[DT_SYMTAB]); + const char *strtab = (const void *) D_PTR (l, l_info[DT_STRTAB]); + const ElfW(Sym) *sym = &symtab[ELFW(R_SYM) (reloc->r_info)]; + lookup_t result; + + /* Look up the target symbol. If the normal lookup rules are not + used don't look in the global scope. */ + if (ELFW(ST_BIND) (sym->st_info) != STB_LOCAL + && __builtin_expect (ELFW(ST_VISIBILITY) (sym->st_other), 0) == 0) + { + const struct r_found_version *version = NULL; + + if (l->l_info[VERSYMIDX (DT_VERSYM)] != NULL) + { + const ElfW(Half) *vernum = + (const void *) D_PTR (l, l_info[VERSYMIDX (DT_VERSYM)]); + ElfW(Half) ndx = vernum[ELFW(R_SYM) (reloc->r_info)] & 0x7fff; + version = &l->l_versions[ndx]; + if (version->hash == 0) + version = NULL; + } + + result = _dl_lookup_symbol_x (strtab + sym->st_name, l, &sym, + l->l_scope, version, ELF_RTYPE_CLASS_PLT, + DL_LOOKUP_ADD_DEPENDENCY, NULL); + } + else + { + /* We already found the symbol. The module (and therefore its load + address) is also known. */ + result = l; + } + + if (!sym) + { + td->arg = (void*) reloc->r_addend; + td->entry = _dl_tlsdesc_undefweak; + } + else + { +# ifndef SHARED + CHECK_STATIC_TLS (l, result); +# else + if (!TRY_STATIC_TLS (l, result)) + { + td->arg = _dl_make_tlsdesc_dynamic (result, sym->st_value + + reloc->r_addend); + td->entry = _dl_tlsdesc_dynamic; + } + else +# endif + { + td->arg = (void*) (sym->st_value - result->l_tls_offset + + reloc->r_addend); + td->entry = _dl_tlsdesc_return; + } + } + + _dl_tlsdesc_wake_up_held_fixups (); +} + +/* This function is used to avoid busy waiting for other threads to + complete the lazy relocation. Once another thread wins the race to + relocate a TLS descriptor, it sets the descriptor up such that this + function is called to wait until the resolver releases the + lock. */ + +void +__attribute__ ((regparm (3))) attribute_hidden +_dl_tlsdesc_resolve_hold_fixup (struct tlsdesc volatile *td, + struct link_map *l __attribute__((__unused__)), + ptrdiff_t entry_check_offset) +{ + /* Maybe we're lucky and can return early. */ + if (__builtin_return_address (0) - entry_check_offset != td->entry) + return; + + /* Locking here will stop execution until the running resolver runs + _dl_tlsdesc_wake_up_held_fixups(), releasing the lock. + + FIXME: We'd be better off waiting on a condition variable, such + that we didn't have to hold the lock throughout the relocation + processing. 
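The hold fixup that follows uses a classic wait-without-spinning idiom: acquire the lock the resolver currently holds, then release it immediately. In pthread terms (the real code uses the rtld recursive load lock) a minimal sketch looks like this:

#include <pthread.h>

static pthread_mutex_t resolver_lock = PTHREAD_MUTEX_INITIALIZER;

// Blocks until the thread doing the resolution drops the lock; once
// it is acquired there is nothing left to do, so release and return.
static void
wait_for_resolver (void)
{
  pthread_mutex_lock (&resolver_lock);
  pthread_mutex_unlock (&resolver_lock);
}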
*/ + __rtld_lock_lock_recursive (GL(dl_load_lock)); + __rtld_lock_unlock_recursive (GL(dl_load_lock)); +} + + +/* Unmap the dynamic object, but also release its TLS descriptor table + if there is one. */ + +void +internal_function +_dl_unmap (struct link_map *map) +{ + _dl_unmap_segments (map); + +#ifdef SHARED + if (map->l_mach.tlsdesc_table) + htab_delete (map->l_mach.tlsdesc_table); +#endif +} diff --git a/REORG.TODO/sysdeps/i386/tlsdesc.sym b/REORG.TODO/sysdeps/i386/tlsdesc.sym new file mode 100644 index 0000000000..33854975d0 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/tlsdesc.sym @@ -0,0 +1,17 @@ +#include <stddef.h> +#include <sysdep.h> +#include <tls.h> +#include <link.h> +#include <dl-tlsdesc.h> + +-- + +-- Abuse tls.h macros to derive offsets relative to the thread register. + +DTV_OFFSET offsetof(struct pthread, header.dtv) + +TLSDESC_ARG offsetof(struct tlsdesc, arg) + +TLSDESC_GEN_COUNT offsetof(struct tlsdesc_dynamic_arg, gen_count) +TLSDESC_MODID offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_module) +TLSDESC_MODOFF offsetof(struct tlsdesc_dynamic_arg, tlsinfo.ti_offset) diff --git a/REORG.TODO/sysdeps/i386/tst-audit.h b/REORG.TODO/sysdeps/i386/tst-audit.h new file mode 100644 index 0000000000..87bf199c85 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/tst-audit.h @@ -0,0 +1,25 @@ +/* Definitions for testing PLT entry/exit auditing. i386 version. + + Copyright (C) 2012-2017 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#define pltenter la_i86_gnu_pltenter +#define pltexit la_i86_gnu_pltexit +#define La_regs La_i86_regs +#define La_retval La_i86_retval +#define int_retval lrv_eax diff --git a/REORG.TODO/sysdeps/i386/tst-audit3.c b/REORG.TODO/sysdeps/i386/tst-audit3.c new file mode 100644 index 0000000000..b67a59d733 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/tst-audit3.c @@ -0,0 +1,37 @@ +/* Test case for i386 preserved registers in dynamic linker. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
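The tlsdesc.sym entries above are compile-time expressions that the gen-as-const machinery evaluates and emits as assembler constants, so hand-written assembly can address structure fields symbolically. A hedged C illustration of the underlying offsetof idea (hypothetical layout):

#include <stddef.h>

struct tlsdesc_layout { void *entry; void *arg; };

// The generated header would effectively contain "#define TLSDESC_ARG 4"
// on i386, letting asm write "movl TLSDESC_ARG(%eax), %ecx" to load
// the arg field.
enum { TLSDESC_ARG_SKETCH = offsetof (struct tlsdesc_layout, arg) };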
*/ + +#include <stdlib.h> +#include "tst-audit3.h" + +static int +do_test (void) +{ + long long ll = audit1_test (1, 2, 3); + if (ll != 30) + abort (); + + float f = audit2_test (1, 2, 3); + if (f != 30) + abort (); + + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../../test-skeleton.c" diff --git a/REORG.TODO/sysdeps/i386/tst-audit3.h b/REORG.TODO/sysdeps/i386/tst-audit3.h new file mode 100644 index 0000000000..f6d3b9181e --- /dev/null +++ b/REORG.TODO/sysdeps/i386/tst-audit3.h @@ -0,0 +1,20 @@ +/* Test case for i386 preserved registers in dynamic linker. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +extern long long audit1_test (int, int, int) __attribute__ ((regparm(3))); +extern float audit2_test (int, int, int) __attribute__ ((regparm(3))); diff --git a/REORG.TODO/sysdeps/i386/tst-auditmod3a.c b/REORG.TODO/sysdeps/i386/tst-auditmod3a.c new file mode 100644 index 0000000000..a333cdcff9 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/tst-auditmod3a.c @@ -0,0 +1,38 @@ +/* Test case for i386 preserved registers in dynamic linker. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdlib.h> +#include "tst-audit3.h" + +long long +__attribute__ ((regparm(3))) +audit1_test (int i, int j, int k) +{ + if (i != 1 || j != 2 || k != 3) + abort (); + return 30; +} + +float +__attribute__ ((regparm(3))) +audit2_test (int i, int j, int k) +{ + if (i != 1 || j != 2 || k != 3) + abort (); + return 30; +} diff --git a/REORG.TODO/sysdeps/i386/tst-auditmod3b.c b/REORG.TODO/sysdeps/i386/tst-auditmod3b.c new file mode 100644 index 0000000000..523f3cec90 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/tst-auditmod3b.c @@ -0,0 +1,186 @@ +/* Test case for i386 preserved registers in dynamic linker. + Copyright (C) 2015-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
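These tests revolve around the i386 regparm(3) convention: the first three integer arguments travel in %eax, %edx and %ecx, and a long long result is returned in the %edx:%eax pair, which is exactly what the audit module below checks. A hedged standalone example (hypothetical name):

extern long long rp_demo (int a, int b, int c) __attribute__ ((regparm (3)));

long long
rp_demo (int a, int b, int c)
{
  // a, b and c arrive in %eax, %edx and %ecx respectively; the
  // 64-bit result goes back in %edx:%eax.
  return (long long) a + b + c;
}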
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <dlfcn.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <stddef.h> +#include <string.h> +#include <unistd.h> +#include <link.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + const char *flagstr; + switch (flag) + { + case LA_ACT_CONSISTENT: + flagstr = "consistent"; + break; + case LA_ACT_ADD: + flagstr = "add"; + break; + case LA_ACT_DELETE: + flagstr = "delete"; + break; + default: + printf ("activity: unknown activity %u\n", flag); + return; + } + printf ("activity: %s\n", flagstr); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + const char *flagstr; + switch (flag) + { + case LA_SER_ORIG: + flagstr = "LA_SET_ORIG"; + break; + case LA_SER_LIBPATH: + flagstr = "LA_SER_LIBPATH"; + break; + case LA_SER_RUNPATH: + flagstr = "LA_SER_RUNPATH"; + break; + case LA_SER_CONFIG: + flagstr = "LA_SER_CONFIG"; + break; + case LA_SER_DEFAULT: + flagstr = "LA_SER_DEFAULT"; + break; + case LA_SER_SECURE: + flagstr = "LA_SER_SECURE"; + break; + default: + printf ("objsearch: %s, unknown flag %d\n", name, flag); + return (char *) name; + } + + printf ("objsearch: %s, %s\n", name, flagstr); + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#include "tst-audit.h" + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + if (strcmp (symname, "audit1_test") == 0 + || strcmp (symname, "audit2_test") == 0) + { + if (regs->lr_eax != 1 + || regs->lr_edx != 2 + || regs->lr_ecx != 3) + abort (); + + *framesizep = 200; + } + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, + (ptrdiff_t) outregs->int_retval); + + if (strcmp (symname, "audit1_test") == 0 + || strcmp (symname, "audit2_test") == 0) + { + if (inregs->lr_eax != 1 + || inregs->lr_edx != 2 + || inregs->lr_ecx != 3) + abort (); + + if (strcmp (symname, 
"audit1_test") == 0) + { + long long x = ((unsigned long long) outregs->lrv_eax + | (unsigned long long) outregs->lrv_edx << 32); + + if (x != 30) + abort (); + } + else if (strcmp (symname, "audit2_test") == 0) + { + if (outregs->lrv_st0 != 30) + abort (); + } + } + + return 0; +} diff --git a/REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh b/REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh new file mode 100755 index 0000000000..83a1dc59fb --- /dev/null +++ b/REORG.TODO/sysdeps/i386/tst-ld-sse-use.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# Make sure no code in ld.so uses xmm/ymm/zmm registers on i386. +# Copyright (C) 2009-2017 Free Software Foundation, Inc. +# This file is part of the GNU C Library. + +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. + +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. + +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# <http://www.gnu.org/licenses/>. + +set -e + +objpfx="$1" +NM="$2" +OBJDUMP="$3" +READELF="$4" + +tmp=$(mktemp ${objpfx}tst-ld-sse-use.XXXXXX) +trap 'rm -f "$tmp"' 1 2 3 15 + +# List of object files we have to test +rtldobjs=$($READELF -W -wi ${objpfx}dl-allobjs.os | + awk '/^ </ { if ($5 == "(DW_TAG_compile_unit)") c=1; else c=0 } $2 == "DW_AT_name" { if (c == 1) print $NF }' | + sed 's,\(.*/\|\)\([_[:alnum:]-]*[.]\).$,\2os,') +rtldobjs="$rtldobjs $(ar t ${objpfx}rtld-libc.a)" + +# OBJECT symbols can be ignored. +$READELF -sW ${objpfx}dl-allobjs.os ${objpfx}rtld-libc.a | +egrep " OBJECT *GLOBAL " | +awk '{if ($7 != "ABS") print $8 }' | +sort -u > "$tmp" +declare -a objects +objects=($(cat "$tmp")) + +objs="dl-runtime.os" +tocheck="dl-runtime.os" + +while test -n "$objs"; do + this="$objs" + objs="" + + for f in $this; do + undef=$($NM -u "$objpfx"../*/"$f" | awk '{print $2}') + if test -n "$undef"; then + for s in $undef; do + for obj in ${objects[*]} "_GLOBAL_OFFSET_TABLE_"; do + if test "$obj" = "$s"; then + continue 2 + fi + done + for o in $rtldobjs; do + ro=$(echo "$objpfx"../*/"$o") + if $NM -g --defined-only "$ro" | egrep -qs " $s\$"; then + if ! 
(echo "$tocheck $objs" | fgrep -qs "$o"); then + echo "$o needed for $s" + objs="$objs $o" + fi + break; + fi + done + done + fi + done + tocheck="$tocheck$objs" +done + +echo +echo +echo "object files needed: $tocheck" + +cp /dev/null "$tmp" +for f in $tocheck; do + $OBJDUMP -d "$objpfx"../*/"$f" | + awk 'BEGIN { last="" } /^[[:xdigit:]]* <[_[:alnum:]]*>:$/ { fct=substr($2, 2, length($2)-3) } /,%[xyz]mm[[:digit:]]*$/ { if (last != fct) { print fct; last=fct} }' | + while read fct; do + if test "$fct" = "_dl_runtime_profile" -o "$fct" = "_dl_x86_64_restore_sse"; then + continue; + fi + echo "function $fct in $f modifies xmm/ymm/zmm" >> "$tmp" + result=1 + done +done + +if test -s "$tmp"; then + echo + echo + cat "$tmp" + result=1 +else + result=0 +fi + +rm "$tmp" +exit $result diff --git a/REORG.TODO/sysdeps/i386/tst-stack-align.h b/REORG.TODO/sysdeps/i386/tst-stack-align.h new file mode 100644 index 0000000000..76276d4a28 --- /dev/null +++ b/REORG.TODO/sysdeps/i386/tst-stack-align.h @@ -0,0 +1,41 @@ +/* Copyright (C) 2004-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdio.h> +#include <stdint.h> + +typedef struct { int i[4]; } int_al16 __attribute__((aligned (16))); + +#define TEST_STACK_ALIGN() \ + ({ \ + int_al16 _m; \ + double _d = 12.0; \ + long double _ld = 15.0; \ + int _ret = 0; \ + printf ("int_al16: %p %zu\n", &_m, __alignof (int_al16)); \ + if ((((uintptr_t) &_m) & (__alignof (int_al16) - 1)) != 0) \ + _ret = 1; \ + \ + printf ("double: %g %p %zu\n", _d, &_d, __alignof (double)); \ + if ((((uintptr_t) &_d) & (__alignof (double) - 1)) != 0) \ + _ret = 1; \ + \ + printf ("ldouble: %Lg %p %zu\n", _ld, &_ld, __alignof (long double)); \ + if ((((uintptr_t) &_ld) & (__alignof (long double) - 1)) != 0) \ + _ret = 1; \ + _ret; \ + }) |
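Finally, a hedged usage sketch for the TEST_STACK_ALIGN macro above: it prints the probed addresses and evaluates to nonzero if any local misses its alignment, so a test driver can simply abort on that result (the real tests reach it through callbacks compiled with alignment-related flags):

#include <stdlib.h>
#include "tst-stack-align.h"

static int
do_align_check (void)
{
  // Nonzero means one of the probed locals missed its ABI alignment.
  if (TEST_STACK_ALIGN ())
    abort ();
  return 0;
}

int
main (void)
{
  return do_align_check ();
}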