45 files changed, 4437 insertions, 358 deletions
diff --git a/ChangeLog b/ChangeLog index 6f7026debf..a2fbd61fe6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,111 @@ +2009-07-26 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/x86_64/tst-xmmymm.sh: New file. Check whether any of the + functions used in ld.so modify xmm/ymm registers. + * sysdeps/x86_64/Makefile: Hook new test up. + * sysdeps/x86_64/rtld-memchr.c: New file. + * sysdeps/x86_64/rtld-memcmp.c: New file. + * sysdeps/x86_64/rtld-rawmemchr.c: New file. + * sysdeps/x86_64/rtld-strchr.S: New file. + * sysdeps/x86_64/rtld-strcmp.S: New file. + * sysdeps/x86_64/rtld-strlen.S: New file. + * sysdeps/x86_64/multiarch/rtld-rawmemchr.c: New file. + * sysdeps/x86_64/multiarch/rtld-strlen.S: New file. + +2009-07-26 H.J. Lu <hongjiu.lu@intel.com> + + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove + strncmp-c. + * sysdeps/x86_64/multiarch/strcmp.S (aftertail): Removed. + (exit): Likewise. + (Byte1): Likewise. + (Byte2): Likewise. + (Byte3): Likewise. + (Byte4): Likewise. + (Byte5): Likewise. + (Byte6): Likewise. + (next_8_bytes): Likewise. + (Byte0): Remove commented out codes. + (unaligned_table): Align jump table at 8 bytes. + Add _sse4_2 to all labels. Always include "../strcmp.S". + * sysdeps/x86_64/multiarch/strncmp-c.c: Removed. + * sysdeps/x86_64/strcmp.S: Add SSE2 support. + * sysdeps/x86_64/strncmp.S: New file. + +2009-07-26 Ulrich Drepper <drepper@redhat.com> + + [BZ #10422] + * sysdeps/unix/sysv/linux/eventfd.c: Add compatibility for old + kernels, dropped when eventfd2 support was added. + * sysdeps/unix/sysv/linux/signalfd.c: Add compatibility for old + kernels, dropped when signalfd4 support was added. + * sysdeps/unix/sysv/linux/kernel-features.h: More CLOEXEC syscalls + added, name them. + + [BZ #10452] + * resolv/res_send.c (send_dg): Pass full SERVFAIL, NOTIMP, REFUSED + replies up. + + * elf/elf.h: Define NT_GNU_GOLD_VERSION. + +2009-07-25 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/x86_64/multiarch/strcmp.S: Some more optimizations for + modern processor versions. Patch by H.J. Lu <hongjiu.lu@intel.com>. + + [BZ #10448] + * sysdeps/posix/getaddrinfo.c (gaih_inet): If NSS module contains no + callback we must touch the status to avoid using stale value. + + * sysdeps/x86_64/multiarch/strcmp.S: Exclude unused code from being + compiled in. + +2009-07-24 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/unix/sysv/linux/configure.in: Don't automatically include + /lib/modules/* headers anymore. We have sane headers in the standard + place now. + +2009-06-16 Andreas Krebbel <Andreas.Krebbel@de.ibm.com> + + * sysdeps/s390/dl-procinfo.c (_dl_s390_cap_flags): "hpage", + "etf3enh" and "highgprs" added. + (_dl_s390_platforms): "z10" added. + * sysdeps/s390/dl-procinfo.h (_DL_HWCAP_COUNT, _DL_PLATFORMS_COUNT): + Increased for the new entries. + (HWCAP enum): HWCAP_S390_HPAGE, HWCAP_S390_ETF3EH and + HWCAP_S390_HIGH_GPRS added. + + * sysdeps/s390/s390-64/Makefile: Adjusted to build the new modules. + * sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c: New file. + * sysdeps/s390/s390-64/utf16-utf32-z9.c: New file. + * sysdeps/s390/s390-64/utf8-utf16-z9.c: New file. + * sysdeps/s390/s390-64/utf8-utf32-z9.c: New file. + 2009-07-23 Ulrich Drepper <drepper@redhat.com> + * sysdeps/x86_64/cacheinfo.c [USE_MULTIARCH]: Rearrange code to + avoid additional cpuid instructions. Most of the information is + stored somewhere. + + * sysdeps/unix/sysv/linux/i386/sysconf.c (intel_02_known): Add more + cache descriptors. + * sysdeps/x86_64/cacheinfo.c (intel_02_known): Likewise. 
+ + * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Reset + SSSE3 bit for Atoms. + * sysdeps/x86_64/multiarch/strcpy.S: New need to perform Atom test + here anymore. + + * posix/tst-rfc3484.c (do_test): Initialize entire sockaddr_in + structure before copying it to avoid warning. + * posix/tst-rfc3484-2.c (do_test): Likewise. + * posix/tst-rfc3484-3.c (do_test): Likewise. + + [BZ #10416] + * include/unistd.h: Make header file suitable for C++ test cases. + Patch by Duncan Simpson <dps@simpson.demon.co.uk>. + * sysdeps/unix/sysv/linux/i386/makecontext.S: Ensure we preserve the stack alignment in the exit code. diff --git a/elf/elf.h b/elf/elf.h index 7efdedefb4..ce6de07e91 100644 --- a/elf/elf.h +++ b/elf/elf.h @@ -1054,6 +1054,9 @@ typedef struct The descriptor consists of any nonzero number of bytes. */ #define NT_GNU_BUILD_ID 3 +/* Version note generated by GNU gold containing a version string. */ +#define NT_GNU_GOLD_VERSION 4 + /* Move records. */ typedef struct diff --git a/include/unistd.h b/include/unistd.h index 72d7e2e88c..ccba893abe 100644 --- a/include/unistd.h +++ b/include/unistd.h @@ -1,6 +1,8 @@ #ifndef _UNISTD_H # include <posix/unistd.h> +__BEGIN_DECLS + libc_hidden_proto (_exit, __noreturn__) libc_hidden_proto (alarm) libc_hidden_proto (confstr) @@ -174,4 +176,6 @@ extern int __have_sock_cloexec; unless it is really necessary. */ #define __have_pipe2 __have_sock_cloexec +__END_DECLS + #endif diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 1f24aa5849..c485435e82 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,3 +1,17 @@ +2009-07-26 Ulrich Drepper <drepper@redhat.com> + + [BZ #10418] + * pthread_mutex_lock.c (pthread_mutex_lock): Use _rel instead of of + _acq variants of cmpxchg. + * pthread_mutex_timedlock.c (pthread_mutex_timedlock): Likewise. + +2009-07-23 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/x86_64/configure.in: New file. + + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S: Fix error + path when not using absolute timeout futex. + 2009-07-20 Ulrich Drepper <drepper@redhat.com> * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S: Minor diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c index 406e588fdb..a0ff881faf 100644 --- a/nptl/pthread_mutex_lock.c +++ b/nptl/pthread_mutex_lock.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2002-2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. 
@@ -160,7 +160,7 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex) #endif newval - = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock, + = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock, newval, oldval); if (newval != oldval) @@ -285,7 +285,7 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex) #ifdef NO_INCR newval |= FUTEX_WAITERS; #endif - oldval = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock, + oldval = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock, newval, 0); if (oldval != 0) @@ -420,7 +420,7 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex) oldprio = ceiling; oldval - = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock, + = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock, #ifdef NO_INCR ceilval | 2, #else @@ -434,7 +434,7 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex) do { oldval - = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock, + = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock, ceilval | 2, ceilval | 1); @@ -445,7 +445,7 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex) lll_futex_wait (&mutex->__data.__lock, ceilval | 2, PTHREAD_MUTEX_PSHARED (mutex)); } - while (atomic_compare_and_exchange_val_acq (&mutex->__data.__lock, + while (atomic_compare_and_exchange_val_rel (&mutex->__data.__lock, ceilval | 2, ceilval) != ceilval); } diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c index 8d0db79d58..2c6ff114da 100644 --- a/nptl/pthread_mutex_timedlock.c +++ b/nptl/pthread_mutex_timedlock.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2002-2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. 
@@ -126,7 +126,7 @@ pthread_mutex_timedlock (mutex, abstime) int newval = id | (oldval & FUTEX_WAITERS); newval - = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock, + = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock, newval, oldval); if (newval != oldval) { @@ -246,7 +246,7 @@ pthread_mutex_timedlock (mutex, abstime) } } - oldval = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock, + oldval = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock, id, 0); if (oldval != 0) @@ -404,7 +404,7 @@ pthread_mutex_timedlock (mutex, abstime) oldprio = ceiling; oldval - = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock, + = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock, ceilval | 1, ceilval); if (oldval == ceilval) @@ -413,7 +413,7 @@ pthread_mutex_timedlock (mutex, abstime) do { oldval - = atomic_compare_and_exchange_val_acq (&mutex->__data.__lock, + = atomic_compare_and_exchange_val_rel (&mutex->__data.__lock, ceilval | 2, ceilval | 1); @@ -456,7 +456,7 @@ pthread_mutex_timedlock (mutex, abstime) PTHREAD_MUTEX_PSHARED (mutex)); } } - while (atomic_compare_and_exchange_val_acq (&mutex->__data.__lock, + while (atomic_compare_and_exchange_val_rel (&mutex->__data.__lock, ceilval | 2, ceilval) != ceilval); } diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S index e12790cb96..7486825d5f 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S @@ -551,12 +551,12 @@ __pthread_cond_timedwait: jne 53b cmpq 24(%rsp), %r9 - jbe 45b + jbe 15f cmpq %rax, %r9 ja 39b - cmpq $-ETIMEDOUT, %r14 +15: cmpq $-ETIMEDOUT, %r14 jne 8b jmp 99b diff --git a/nptl/sysdeps/x86_64/configure b/nptl/sysdeps/x86_64/configure new file mode 100644 index 0000000000..b959168843 --- /dev/null +++ b/nptl/sysdeps/x86_64/configure @@ -0,0 +1,36 @@ +# This file is generated from configure.in by Autoconf. DO NOT EDIT! + # Local configure fragment for sysdeps/i386. + +{ echo "$as_me:$LINENO: checking for .cfi_personality and .cfi_lsda pseudo-ops" >&5 +echo $ECHO_N "checking for .cfi_personality and .cfi_lsda pseudo-ops... $ECHO_C" >&6; } +if test "${libc_cv_asm_cfi_personality+set}" = set; then + echo $ECHO_N "(cached) $ECHO_C" >&6 +else + cat > conftest.s <<EOF +${libc_cv_dot_text} +foo: + .cfi_startproc + .cfi_personality 0, foo + .cfi_lsda 0, foo + .cfi_endproc +EOF + if { ac_try='${CC-cc} $ASFLAGS -c conftest.s 1>&5' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + libc_cv_asm_cfi_personality=yes + else + libc_cv_asm_cfi_personality=no + fi + rm -f conftest* + +fi +{ echo "$as_me:$LINENO: result: $libc_cv_asm_cfi_personality" >&5 +echo "${ECHO_T}$libc_cv_asm_cfi_personality" >&6; } +if test x"$libc_cv_asm_cfi_personality" != xyes; then + { { echo "$as_me:$LINENO: error: assembler too old, .cfi_personality support missing" >&5 +echo "$as_me: error: assembler too old, .cfi_personality support missing" >&2;} + { (exit 1); exit 1; }; } +fi diff --git a/nptl/sysdeps/x86_64/configure.in b/nptl/sysdeps/x86_64/configure.in new file mode 100644 index 0000000000..0ba0cc3726 --- /dev/null +++ b/nptl/sysdeps/x86_64/configure.in @@ -0,0 +1,23 @@ +GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. +# Local configure fragment for sysdeps/i386. 
+ +AC_CACHE_CHECK([for .cfi_personality and .cfi_lsda pseudo-ops], + libc_cv_asm_cfi_personality, [dnl + cat > conftest.s <<EOF +${libc_cv_dot_text} +foo: + .cfi_startproc + .cfi_personality 0, foo + .cfi_lsda 0, foo + .cfi_endproc +EOF + if AC_TRY_COMMAND(${CC-cc} $ASFLAGS -c conftest.s 1>&AS_MESSAGE_LOG_FD); then + libc_cv_asm_cfi_personality=yes + else + libc_cv_asm_cfi_personality=no + fi + rm -f conftest* +]) +if test x"$libc_cv_asm_cfi_personality" != xyes; then + AC_MSG_ERROR([assembler too old, .cfi_personality support missing]) +fi diff --git a/posix/tst-rfc3484-2.c b/posix/tst-rfc3484-2.c index c85fdd0742..bf5f6cff7e 100644 --- a/posix/tst-rfc3484-2.c +++ b/posix/tst-rfc3484-2.c @@ -82,6 +82,8 @@ do_test (void) struct sockaddr_in so1; so1.sin_family = AF_INET; so1.sin_addr.s_addr = h (0xc0a85f19); + /* Clear the rest of the structure to avoid warnings. */ + memset (so1.sin_zero, '\0', sizeof (so1.sin_zero)); struct sockaddr_in sa1; sa1.sin_family = AF_INET; diff --git a/posix/tst-rfc3484-3.c b/posix/tst-rfc3484-3.c index 3aa4563c0c..8eba74e48e 100644 --- a/posix/tst-rfc3484-3.c +++ b/posix/tst-rfc3484-3.c @@ -113,6 +113,8 @@ do_test (void) struct sockaddr_in so; so.sin_family = AF_INET; so.sin_addr.s_addr = h (0x0aa85f19); + /* Clear the rest of the structure to avoid warnings. */ + memset (so.sin_zero, '\0', sizeof (so.sin_zero)); for (int i = 0; i < naddrs; ++i) { diff --git a/posix/tst-rfc3484.c b/posix/tst-rfc3484.c index 15d0c94a5e..26835cf8b2 100644 --- a/posix/tst-rfc3484.c +++ b/posix/tst-rfc3484.c @@ -102,6 +102,8 @@ do_test (void) struct sockaddr_in so; so.sin_family = AF_INET; so.sin_addr.s_addr = h (0xc0a85f19); + /* Clear the rest of the structure to avoid warnings. */ + memset (so.sin_zero, '\0', sizeof (so.sin_zero)); for (int i = 0; i < naddrs; ++i) { diff --git a/resolv/res_send.c b/resolv/res_send.c index 971a4afb6f..4c14db1bf5 100644 --- a/resolv/res_send.c +++ b/resolv/res_send.c @@ -1278,14 +1278,10 @@ send_dg(res_state statp, ? *thisanssiz : *thisresplen); if (recvresp1 || (buf2 != NULL && recvresp2)) - { - *resplen2 = 1; - return resplen; - } + return resplen; if (buf2 != NULL) { /* We are waiting for a possible second reply. */ - resplen = 1; if (hp->id == anhp->id) recvresp1 = 1; else diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c index d346c621fb..a788d18fee 100644 --- a/sysdeps/posix/getaddrinfo.c +++ b/sysdeps/posix/getaddrinfo.c @@ -833,6 +833,8 @@ gaih_inet (const char *name, const struct gaih_service *service, && inet6_status != NSS_STATUS_UNAVAIL) status = inet6_status; } + else + status = NSS_STATUS_UNAVAIL; } if (nss_next_action (nip, status) == NSS_ACTION_RETURN) diff --git a/sysdeps/s390/dl-procinfo.c b/sysdeps/s390/dl-procinfo.c index 32c6aef951..d51d7b2379 100644 --- a/sysdeps/s390/dl-procinfo.c +++ b/sysdeps/s390/dl-procinfo.c @@ -1,5 +1,5 @@ /* Data for s390 version of processor capability information. - Copyright (C) 2006 Free Software Foundation, Inc. + Copyright (C) 2006, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Martin Schwidefsky <schwidefsky@de.ibm.com>, 2006. 
@@ -47,11 +47,11 @@ #if !defined PROCINFO_DECL && defined SHARED ._dl_s390_cap_flags #else -PROCINFO_CLASS const char _dl_s390_cap_flags[7][6] +PROCINFO_CLASS const char _dl_s390_cap_flags[10][8] #endif #ifndef PROCINFO_DECL = { - "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp" + "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", "hpage", "etf3enh", "highgprs" } #endif #if !defined SHARED || defined PROCINFO_DECL @@ -63,11 +63,11 @@ PROCINFO_CLASS const char _dl_s390_cap_flags[7][6] #if !defined PROCINFO_DECL && defined SHARED ._dl_s390_platforms #else -PROCINFO_CLASS const char _dl_s390_platforms[4][7] +PROCINFO_CLASS const char _dl_s390_platforms[5][7] #endif #ifndef PROCINFO_DECL = { - "g5", "z900", "z990", "z9-109" + "g5", "z900", "z990", "z9-109", "z10" } #endif #if !defined SHARED || defined PROCINFO_DECL diff --git a/sysdeps/s390/dl-procinfo.h b/sysdeps/s390/dl-procinfo.h index 178d7cc017..0a7ebd3be9 100644 --- a/sysdeps/s390/dl-procinfo.h +++ b/sysdeps/s390/dl-procinfo.h @@ -1,5 +1,5 @@ /* s390 version of processor capability information handling macros. - Copyright (C) 2006 Free Software Foundation, Inc. + Copyright (C) 2006, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Martin Schwidefsky <schwidefsky@de.ibm.com>, 2006. @@ -22,9 +22,9 @@ #define _DL_PROCINFO_H 1 #include <ldsodefs.h> -#define _DL_HWCAP_COUNT 7 +#define _DL_HWCAP_COUNT 10 -#define _DL_PLATFORMS_COUNT 4 +#define _DL_PLATFORMS_COUNT 5 /* The kernel provides up to 32 capability bits with elf_hwcap. */ #define _DL_FIRST_PLATFORM 32 @@ -45,6 +45,9 @@ enum HWCAP_S390_LDISP = 1 << 4, HWCAP_S390_EIMM = 1 << 5, HWCAP_S390_DFP = 1 << 6, + HWCAP_S390_HPAGE = 1 << 7, + HWCAP_S390_ETF3EH = 1 << 8, + HWCAP_S390_HIGH_GPRS = 1 << 9, }; #define HWCAP_IMPORTANT (HWCAP_S390_ZARCH | HWCAP_S390_LDISP \ diff --git a/sysdeps/s390/s390-64/Makefile b/sysdeps/s390/s390-64/Makefile index 0a5051449d..1814f37abd 100644 --- a/sysdeps/s390/s390-64/Makefile +++ b/sysdeps/s390/s390-64/Makefile @@ -9,3 +9,70 @@ CFLAGS-rtld.c += -Wno-uninitialized -Wno-unused CFLAGS-dl-load.c += -Wno-unused CFLAGS-dl-reloc.c += -Wno-unused endif + +ifeq ($(subdir),iconvdata) +ISO-8859-1_CP037_Z900-routines := iso-8859-1_cp037_z900 +ISO-8859-1_CP037_Z900-map := gconv.map + +UTF8_UTF32_Z9-routines := utf8-utf32-z9 +UTF8_UTF32_Z9-map := gconv.map + +UTF16_UTF32_Z9-routines := utf16-utf32-z9 +UTF16_UTF32_Z9-map := gconv.map + +UTF8_UTF16_Z9-routines := utf8-utf16-z9 +UTF8_UTF16_Z9-map := gconv.map + +s390x-iconv-modules = ISO-8859-1_CP037_Z900 UTF8_UTF16_Z9 UTF16_UTF32_Z9 UTF8_UTF32_Z9 + +extra-modules-left += $(s390x-iconv-modules) +include extra-module.mk + +extra-objs += $(addsuffix .so, $(s390x-iconv-modules)) +install-others += $(patsubst %, $(inst_gconvdir)/%.so, $(s390x-iconv-modules)) + +distribute += iso-8859-1_cp037_z900.c utf8-utf32-z9.c utf16-utf32-z9.c utf8-utf16-z9.c + +$(patsubst %, $(inst_gconvdir)/%.so, $(s390x-iconv-modules)) : \ +$(inst_gconvdir)/%.so: $(objpfx)%.so $(+force) + $(do-install-program) + +$(objpfx)gconv-modules-s390: gconv-modules $(+force) + cp $< $@ + echo >> $@ + echo "# S/390 hardware accelerated modules" >> $@ + echo -n "module ISO-8859-1// IBM037// " >> $@ + echo " ISO-8859-1_CP037_Z900 1" >> $@ + echo -n "module IBM037// ISO-8859-1// " >> $@ + echo " ISO-8859-1_CP037_Z900 1" >> $@ + echo -n "module ISO-10646/UTF8/ UTF-32// " >> $@ + echo " UTF8_UTF32_Z9 1" >> $@ + echo -n "module UTF-32BE// ISO-10646/UTF8/ " >> $@ + echo " UTF8_UTF32_Z9 1" >> $@ + echo -n "module 
ISO-10646/UTF8/ UTF-32BE// " >> $@ + echo " UTF8_UTF32_Z9 1" >> $@ + echo -n "module UTF-16BE// UTF-32// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-32BE// UTF-16// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module INTERNAL UTF-16// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-32BE// UTF-16BE// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module INTERNAL UTF-16BE// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-16BE// UTF-32BE// " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-16BE// INTERNAL " >> $@ + echo " UTF16_UTF32_Z9 1" >> $@ + echo -n "module UTF-16BE// ISO-10646/UTF8/ " >> $@ + echo " UTF8_UTF16_Z9 1" >> $@ + echo -n "module ISO-10646/UTF8/ UTF-16// " >> $@ + echo " UTF8_UTF16_Z9 1" >> $@ + echo -n "module ISO-10646/UTF8/ UTF-16BE// " >> $@ + echo " UTF8_UTF16_Z9 1" >> $@ + +$(inst_gconvdir)/gconv-modules: $(objpfx)gconv-modules-s390 $(+force) + $(do-install) + +endif diff --git a/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c b/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c new file mode 100644 index 0000000000..d4c4931f22 --- /dev/null +++ b/sysdeps/s390/s390-64/iso-8859-1_cp037_z900.c @@ -0,0 +1,238 @@ +/* Conversion between ISO 8859-1 and IBM037. + + This module uses the Z900 variant of the Translate One To One + instruction. + Copyright (C) 1997-2009 Free Software Foundation, Inc. + + Author: Andreas Krebbel <Andreas.Krebbel@de.ibm.com> + Based on the work by Ulrich Drepper <drepper@cygnus.com>, 1997. + + Thanks to Daniel Appich who covered the relevant performance work + in his diploma thesis. + + This is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#include <dlfcn.h> +#include <stdint.h> + +// conversion table from ISO-8859-1 to IBM037 +static const unsigned char table_iso8859_1_to_cp037[256] +__attribute__ ((aligned (8))) = +{ + [0x00] = 0x00, [0x01] = 0x01, [0x02] = 0x02, [0x03] = 0x03, + [0x04] = 0x37, [0x05] = 0x2D, [0x06] = 0x2E, [0x07] = 0x2F, + [0x08] = 0x16, [0x09] = 0x05, [0x0A] = 0x25, [0x0B] = 0x0B, + [0x0C] = 0x0C, [0x0D] = 0x0D, [0x0E] = 0x0E, [0x0F] = 0x0F, + [0x10] = 0x10, [0x11] = 0x11, [0x12] = 0x12, [0x13] = 0x13, + [0x14] = 0x3C, [0x15] = 0x3D, [0x16] = 0x32, [0x17] = 0x26, + [0x18] = 0x18, [0x19] = 0x19, [0x1A] = 0x3F, [0x1B] = 0x27, + [0x1C] = 0x1C, [0x1D] = 0x1D, [0x1E] = 0x1E, [0x1F] = 0x1F, + [0x20] = 0x40, [0x21] = 0x5A, [0x22] = 0x7F, [0x23] = 0x7B, + [0x24] = 0x5B, [0x25] = 0x6C, [0x26] = 0x50, [0x27] = 0x7D, + [0x28] = 0x4D, [0x29] = 0x5D, [0x2A] = 0x5C, [0x2B] = 0x4E, + [0x2C] = 0x6B, [0x2D] = 0x60, [0x2E] = 0x4B, [0x2F] = 0x61, + [0x30] = 0xF0, [0x31] = 0xF1, [0x32] = 0xF2, [0x33] = 0xF3, + [0x34] = 0xF4, [0x35] = 0xF5, [0x36] = 0xF6, [0x37] = 0xF7, + [0x38] = 0xF8, [0x39] = 0xF9, [0x3A] = 0x7A, [0x3B] = 0x5E, + [0x3C] = 0x4C, [0x3D] = 0x7E, [0x3E] = 0x6E, [0x3F] = 0x6F, + [0x40] = 0x7C, [0x41] = 0xC1, [0x42] = 0xC2, [0x43] = 0xC3, + [0x44] = 0xC4, [0x45] = 0xC5, [0x46] = 0xC6, [0x47] = 0xC7, + [0x48] = 0xC8, [0x49] = 0xC9, [0x4A] = 0xD1, [0x4B] = 0xD2, + [0x4C] = 0xD3, [0x4D] = 0xD4, [0x4E] = 0xD5, [0x4F] = 0xD6, + [0x50] = 0xD7, [0x51] = 0xD8, [0x52] = 0xD9, [0x53] = 0xE2, + [0x54] = 0xE3, [0x55] = 0xE4, [0x56] = 0xE5, [0x57] = 0xE6, + [0x58] = 0xE7, [0x59] = 0xE8, [0x5A] = 0xE9, [0x5B] = 0xBA, + [0x5C] = 0xE0, [0x5D] = 0xBB, [0x5E] = 0xB0, [0x5F] = 0x6D, + [0x60] = 0x79, [0x61] = 0x81, [0x62] = 0x82, [0x63] = 0x83, + [0x64] = 0x84, [0x65] = 0x85, [0x66] = 0x86, [0x67] = 0x87, + [0x68] = 0x88, [0x69] = 0x89, [0x6A] = 0x91, [0x6B] = 0x92, + [0x6C] = 0x93, [0x6D] = 0x94, [0x6E] = 0x95, [0x6F] = 0x96, + [0x70] = 0x97, [0x71] = 0x98, [0x72] = 0x99, [0x73] = 0xA2, + [0x74] = 0xA3, [0x75] = 0xA4, [0x76] = 0xA5, [0x77] = 0xA6, + [0x78] = 0xA7, [0x79] = 0xA8, [0x7A] = 0xA9, [0x7B] = 0xC0, + [0x7C] = 0x4F, [0x7D] = 0xD0, [0x7E] = 0xA1, [0x7F] = 0x07, + [0x80] = 0x20, [0x81] = 0x21, [0x82] = 0x22, [0x83] = 0x23, + [0x84] = 0x24, [0x85] = 0x15, [0x86] = 0x06, [0x87] = 0x17, + [0x88] = 0x28, [0x89] = 0x29, [0x8A] = 0x2A, [0x8B] = 0x2B, + [0x8C] = 0x2C, [0x8D] = 0x09, [0x8E] = 0x0A, [0x8F] = 0x1B, + [0x90] = 0x30, [0x91] = 0x31, [0x92] = 0x1A, [0x93] = 0x33, + [0x94] = 0x34, [0x95] = 0x35, [0x96] = 0x36, [0x97] = 0x08, + [0x98] = 0x38, [0x99] = 0x39, [0x9A] = 0x3A, [0x9B] = 0x3B, + [0x9C] = 0x04, [0x9D] = 0x14, [0x9E] = 0x3E, [0x9F] = 0xFF, + [0xA0] = 0x41, [0xA1] = 0xAA, [0xA2] = 0x4A, [0xA3] = 0xB1, + [0xA4] = 0x9F, [0xA5] = 0xB2, [0xA6] = 0x6A, [0xA7] = 0xB5, + [0xA8] = 0xBD, [0xA9] = 0xB4, [0xAA] = 0x9A, [0xAB] = 0x8A, + [0xAC] = 0x5F, [0xAD] = 0xCA, [0xAE] = 0xAF, [0xAF] = 0xBC, + [0xB0] = 0x90, [0xB1] = 0x8F, [0xB2] = 0xEA, [0xB3] = 0xFA, + [0xB4] = 0xBE, [0xB5] = 0xA0, [0xB6] = 0xB6, [0xB7] = 0xB3, + [0xB8] = 0x9D, [0xB9] = 0xDA, [0xBA] = 0x9B, [0xBB] = 0x8B, + [0xBC] = 0xB7, [0xBD] = 0xB8, [0xBE] = 0xB9, [0xBF] = 0xAB, + [0xC0] = 0x64, [0xC1] = 0x65, [0xC2] = 0x62, [0xC3] = 0x66, + [0xC4] = 0x63, [0xC5] = 0x67, [0xC6] = 0x9E, [0xC7] = 0x68, + [0xC8] = 0x74, [0xC9] = 0x71, [0xCA] = 0x72, [0xCB] = 0x73, + [0xCC] = 0x78, [0xCD] = 0x75, [0xCE] = 0x76, [0xCF] = 0x77, + [0xD0] = 0xAC, [0xD1] = 0x69, [0xD2] = 0xED, [0xD3] = 0xEE, + [0xD4] = 0xEB, [0xD5] = 0xEF, [0xD6] = 0xEC, [0xD7] = 0xBF, + [0xD8] = 0x80, 
[0xD9] = 0xFD, [0xDA] = 0xFE, [0xDB] = 0xFB, + [0xDC] = 0xFC, [0xDD] = 0xAD, [0xDE] = 0xAE, [0xDF] = 0x59, + [0xE0] = 0x44, [0xE1] = 0x45, [0xE2] = 0x42, [0xE3] = 0x46, + [0xE4] = 0x43, [0xE5] = 0x47, [0xE6] = 0x9C, [0xE7] = 0x48, + [0xE8] = 0x54, [0xE9] = 0x51, [0xEA] = 0x52, [0xEB] = 0x53, + [0xEC] = 0x58, [0xED] = 0x55, [0xEE] = 0x56, [0xEF] = 0x57, + [0xF0] = 0x8C, [0xF1] = 0x49, [0xF2] = 0xCD, [0xF3] = 0xCE, + [0xF4] = 0xCB, [0xF5] = 0xCF, [0xF6] = 0xCC, [0xF7] = 0xE1, + [0xF8] = 0x70, [0xF9] = 0xDD, [0xFA] = 0xDE, [0xFB] = 0xDB, + [0xFC] = 0xDC, [0xFD] = 0x8D, [0xFE] = 0x8E, [0xFF] = 0xDF +}; + +// conversion table from IBM037 to ISO-8859-1 +static const unsigned char table_cp037_iso8859_1[256] +__attribute__ ((aligned (8))) = +{ + [0x00] = 0x00, [0x01] = 0x01, [0x02] = 0x02, [0x03] = 0x03, + [0x04] = 0x9C, [0x05] = 0x09, [0x06] = 0x86, [0x07] = 0x7F, + [0x08] = 0x97, [0x09] = 0x8D, [0x0A] = 0x8E, [0x0B] = 0x0B, + [0x0C] = 0x0C, [0x0D] = 0x0D, [0x0E] = 0x0E, [0x0F] = 0x0F, + [0x10] = 0x10, [0x11] = 0x11, [0x12] = 0x12, [0x13] = 0x13, + [0x14] = 0x9D, [0x15] = 0x85, [0x16] = 0x08, [0x17] = 0x87, + [0x18] = 0x18, [0x19] = 0x19, [0x1A] = 0x92, [0x1B] = 0x8F, + [0x1C] = 0x1C, [0x1D] = 0x1D, [0x1E] = 0x1E, [0x1F] = 0x1F, + [0x20] = 0x80, [0x21] = 0x81, [0x22] = 0x82, [0x23] = 0x83, + [0x24] = 0x84, [0x25] = 0x0A, [0x26] = 0x17, [0x27] = 0x1B, + [0x28] = 0x88, [0x29] = 0x89, [0x2A] = 0x8A, [0x2B] = 0x8B, + [0x2C] = 0x8C, [0x2D] = 0x05, [0x2E] = 0x06, [0x2F] = 0x07, + [0x30] = 0x90, [0x31] = 0x91, [0x32] = 0x16, [0x33] = 0x93, + [0x34] = 0x94, [0x35] = 0x95, [0x36] = 0x96, [0x37] = 0x04, + [0x38] = 0x98, [0x39] = 0x99, [0x3A] = 0x9A, [0x3B] = 0x9B, + [0x3C] = 0x14, [0x3D] = 0x15, [0x3E] = 0x9E, [0x3F] = 0x1A, + [0x40] = 0x20, [0x41] = 0xA0, [0x42] = 0xE2, [0x43] = 0xE4, + [0x44] = 0xE0, [0x45] = 0xE1, [0x46] = 0xE3, [0x47] = 0xE5, + [0x48] = 0xE7, [0x49] = 0xF1, [0x4A] = 0xA2, [0x4B] = 0x2E, + [0x4C] = 0x3C, [0x4D] = 0x28, [0x4E] = 0x2B, [0x4F] = 0x7C, + [0x50] = 0x26, [0x51] = 0xE9, [0x52] = 0xEA, [0x53] = 0xEB, + [0x54] = 0xE8, [0x55] = 0xED, [0x56] = 0xEE, [0x57] = 0xEF, + [0x58] = 0xEC, [0x59] = 0xDF, [0x5A] = 0x21, [0x5B] = 0x24, + [0x5C] = 0x2A, [0x5D] = 0x29, [0x5E] = 0x3B, [0x5F] = 0xAC, + [0x60] = 0x2D, [0x61] = 0x2F, [0x62] = 0xC2, [0x63] = 0xC4, + [0x64] = 0xC0, [0x65] = 0xC1, [0x66] = 0xC3, [0x67] = 0xC5, + [0x68] = 0xC7, [0x69] = 0xD1, [0x6A] = 0xA6, [0x6B] = 0x2C, + [0x6C] = 0x25, [0x6D] = 0x5F, [0x6E] = 0x3E, [0x6F] = 0x3F, + [0x70] = 0xF8, [0x71] = 0xC9, [0x72] = 0xCA, [0x73] = 0xCB, + [0x74] = 0xC8, [0x75] = 0xCD, [0x76] = 0xCE, [0x77] = 0xCF, + [0x78] = 0xCC, [0x79] = 0x60, [0x7A] = 0x3A, [0x7B] = 0x23, + [0x7C] = 0x40, [0x7D] = 0x27, [0x7E] = 0x3D, [0x7F] = 0x22, + [0x80] = 0xD8, [0x81] = 0x61, [0x82] = 0x62, [0x83] = 0x63, + [0x84] = 0x64, [0x85] = 0x65, [0x86] = 0x66, [0x87] = 0x67, + [0x88] = 0x68, [0x89] = 0x69, [0x8A] = 0xAB, [0x8B] = 0xBB, + [0x8C] = 0xF0, [0x8D] = 0xFD, [0x8E] = 0xFE, [0x8F] = 0xB1, + [0x90] = 0xB0, [0x91] = 0x6A, [0x92] = 0x6B, [0x93] = 0x6C, + [0x94] = 0x6D, [0x95] = 0x6E, [0x96] = 0x6F, [0x97] = 0x70, + [0x98] = 0x71, [0x99] = 0x72, [0x9A] = 0xAA, [0x9B] = 0xBA, + [0x9C] = 0xE6, [0x9D] = 0xB8, [0x9E] = 0xC6, [0x9F] = 0xA4, + [0xA0] = 0xB5, [0xA1] = 0x7E, [0xA2] = 0x73, [0xA3] = 0x74, + [0xA4] = 0x75, [0xA5] = 0x76, [0xA6] = 0x77, [0xA7] = 0x78, + [0xA8] = 0x79, [0xA9] = 0x7A, [0xAA] = 0xA1, [0xAB] = 0xBF, + [0xAC] = 0xD0, [0xAD] = 0xDD, [0xAE] = 0xDE, [0xAF] = 0xAE, + [0xB0] = 0x5E, [0xB1] = 0xA3, [0xB2] = 0xA5, [0xB3] = 0xB7, + [0xB4] = 0xA9, 
[0xB5] = 0xA7, [0xB6] = 0xB6, [0xB7] = 0xBC, + [0xB8] = 0xBD, [0xB9] = 0xBE, [0xBA] = 0x5B, [0xBB] = 0x5D, + [0xBC] = 0xAF, [0xBD] = 0xA8, [0xBE] = 0xB4, [0xBF] = 0xD7, + [0xC0] = 0x7B, [0xC1] = 0x41, [0xC2] = 0x42, [0xC3] = 0x43, + [0xC4] = 0x44, [0xC5] = 0x45, [0xC6] = 0x46, [0xC7] = 0x47, + [0xC8] = 0x48, [0xC9] = 0x49, [0xCA] = 0xAD, [0xCB] = 0xF4, + [0xCC] = 0xF6, [0xCD] = 0xF2, [0xCE] = 0xF3, [0xCF] = 0xF5, + [0xD0] = 0x7D, [0xD1] = 0x4A, [0xD2] = 0x4B, [0xD3] = 0x4C, + [0xD4] = 0x4D, [0xD5] = 0x4E, [0xD6] = 0x4F, [0xD7] = 0x50, + [0xD8] = 0x51, [0xD9] = 0x52, [0xDA] = 0xB9, [0xDB] = 0xFB, + [0xDC] = 0xFC, [0xDD] = 0xF9, [0xDE] = 0xFA, [0xDF] = 0xFF, + [0xE0] = 0x5C, [0xE1] = 0xF7, [0xE2] = 0x53, [0xE3] = 0x54, + [0xE4] = 0x55, [0xE5] = 0x56, [0xE6] = 0x57, [0xE7] = 0x58, + [0xE8] = 0x59, [0xE9] = 0x5A, [0xEA] = 0xB2, [0xEB] = 0xD4, + [0xEC] = 0xD6, [0xED] = 0xD2, [0xEE] = 0xD3, [0xEF] = 0xD5, + [0xF0] = 0x30, [0xF1] = 0x31, [0xF2] = 0x32, [0xF3] = 0x33, + [0xF4] = 0x34, [0xF5] = 0x35, [0xF6] = 0x36, [0xF7] = 0x37, + [0xF8] = 0x38, [0xF9] = 0x39, [0xFA] = 0xB3, [0xFB] = 0xDB, + [0xFC] = 0xDC, [0xFD] = 0xD9, [0xFE] = 0xDA, [0xFF] = 0x9F +}; + +/* Definitions used in the body of the `gconv' function. */ +#define CHARSET_NAME "ISO-8859-1//" +#define FROM_LOOP iso8859_1_to_cp037_z900 +#define TO_LOOP cp037_to_iso8859_1_z900 +#define DEFINE_INIT 1 +#define DEFINE_FINI 1 +#define MIN_NEEDED_FROM 1 +#define MIN_NEEDED_TO 1 + +/* The Z900 variant of troo forces us to always specify a test + character which ends the translation. So if we run into the + situation where the translation has been interrupted due to the + test character we translate the character by hand and jump back + into the instruction. */ + +#define TROO_LOOP(TABLE) \ + { \ + register const unsigned char test asm ("0") = 0; \ + register const unsigned char *pTable asm ("1") = TABLE; \ + register unsigned char *pOutput asm ("2") = outptr; \ + register uint64_t length asm ("3"); \ + const unsigned char* pInput = inptr; \ + uint64_t tmp; \ + \ + length = (inend - inptr < outend - outptr \ + ? inend - inptr : outend - outptr); \ + \ + asm volatile ("0: \n\t" \ + " troo %0,%1 \n\t" \ + " jz 1f \n\t" \ + " jo 0b \n\t" \ + " llgc %3,0(%1) \n\t" \ + " la %3,0(%3,%4) \n\t" \ + " mvc 0(1,%0),0(%3) \n\t" \ + " aghi %1,1 \n\t" \ + " aghi %0,1 \n\t" \ + " aghi %2,-1 \n\t" \ + " j 0b \n\t" \ + "1: \n" \ + \ + : "+a" (pOutput), "+a" (pInput), "+d" (length), "=&a" (tmp) \ + : "a" (pTable), "d" (test) \ + : "cc"); \ + \ + inptr = pInput; \ + outptr = pOutput; \ + } + +/* First define the conversion function from ISO 8859-1 to CP037. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +#define BODY TROO_LOOP (table_iso8859_1_to_cp037) + +#include <iconv/loop.c> + + +/* Next, define the conversion function from CP037 to ISO 8859-1. */ +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define LOOPFCT TO_LOOP +#define BODY TROO_LOOP (table_cp037_iso8859_1); + +#include <iconv/loop.c> + + +/* Now define the toplevel functions. */ +#include <iconv/skeleton.c> diff --git a/sysdeps/s390/s390-64/utf16-utf32-z9.c b/sysdeps/s390/s390-64/utf16-utf32-z9.c new file mode 100644 index 0000000000..868dea68ca --- /dev/null +++ b/sysdeps/s390/s390-64/utf16-utf32-z9.c @@ -0,0 +1,325 @@ +/* Conversion between UTF-16 and UTF-32 BE/internal. + + This module uses the Z9-109 variants of the Convert Unicode + instructions. 
+ Copyright (C) 1997-2009 Free Software Foundation, Inc. + + Author: Andreas Krebbel <Andreas.Krebbel@de.ibm.com> + Based on the work by Ulrich Drepper <drepper@cygnus.com>, 1997. + + Thanks to Daniel Appich who covered the relevant performance work + in his diploma thesis. + + This is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <dlfcn.h> +#include <stdint.h> +#include <unistd.h> +#include <dl-procinfo.h> +#include <gconv.h> + +/* UTF-32 big endian byte order mark. */ +#define BOM_UTF32 0x0000feffu + +/* UTF-16 big endian byte order mark. */ +#define BOM_UTF16 0xfeff + +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +#define MIN_NEEDED_FROM 2 +#define MAX_NEEDED_FROM 4 +#define MIN_NEEDED_TO 4 +#define FROM_LOOP from_utf16_loop +#define TO_LOOP to_utf16_loop +#define FROM_DIRECTION (dir == from_utf16) +#define PREPARE_LOOP \ + enum direction dir = ((struct utf16_data *) step->__data)->dir; \ + int emit_bom = ((struct utf16_data *) step->__data)->emit_bom; \ + \ + if (emit_bom && !data->__internal_use \ + && data->__invocation_counter == 0) \ + { \ + if (dir == to_utf16) \ + { \ + /* Emit the UTF-16 Byte Order Mark. */ \ + if (__builtin_expect (outbuf + 2 > outend, 0)) \ + return __GCONV_FULL_OUTPUT; \ + \ + put16u (outbuf, BOM_UTF16); \ + outbuf += 2; \ + } \ + else \ + { \ + /* Emit the UTF-32 Byte Order Mark. */ \ + if (__builtin_expect (outbuf + 4 > outend, 0)) \ + return __GCONV_FULL_OUTPUT; \ + \ + put32u (outbuf, BOM_UTF32); \ + outbuf += 4; \ + } \ + } + +/* Direction of the transformation. */ +enum direction +{ + illegal_dir, + to_utf16, + from_utf16 +}; + +struct utf16_data +{ + enum direction dir; + int emit_bom; +}; + + +extern int gconv_init (struct __gconv_step *step); +int +gconv_init (struct __gconv_step *step) +{ + /* Determine which direction. 
*/ + struct utf16_data *new_data; + enum direction dir = illegal_dir; + int emit_bom; + int result; + + emit_bom = (__strcasecmp (step->__to_name, "UTF-32//") == 0 + || __strcasecmp (step->__to_name, "UTF-16//") == 0); + + if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0 + && (__strcasecmp (step->__to_name, "UTF-32//") == 0 + || __strcasecmp (step->__to_name, "UTF-32BE//") == 0 + || __strcasecmp (step->__to_name, "INTERNAL") == 0)) + { + dir = from_utf16; + } + else if ((__strcasecmp (step->__to_name, "UTF-16//") == 0 + || __strcasecmp (step->__to_name, "UTF-16BE//") == 0) + && (__strcasecmp (step->__from_name, "UTF-32BE//") == 0 + || __strcasecmp (step->__from_name, "INTERNAL") == 0)) + { + dir = to_utf16; + } + + result = __GCONV_NOCONV; + if (dir != illegal_dir) + { + new_data = (struct utf16_data *) malloc (sizeof (struct utf16_data)); + + result = __GCONV_NOMEM; + if (new_data != NULL) + { + new_data->dir = dir; + new_data->emit_bom = emit_bom; + step->__data = new_data; + + if (dir == from_utf16) + { + step->__min_needed_from = MIN_NEEDED_FROM; + step->__max_needed_from = MIN_NEEDED_FROM; + step->__min_needed_to = MIN_NEEDED_TO; + step->__max_needed_to = MIN_NEEDED_TO; + } + else + { + step->__min_needed_from = MIN_NEEDED_TO; + step->__max_needed_from = MIN_NEEDED_TO; + step->__min_needed_to = MIN_NEEDED_FROM; + step->__max_needed_to = MIN_NEEDED_FROM; + } + + step->__stateful = 0; + + result = __GCONV_OK; + } + } + + return result; +} + + +extern void gconv_end (struct __gconv_step *data); +void +gconv_end (struct __gconv_step *data) +{ + free (data->__data); +} + +/* The macro for the hardware loop. This is used for both + directions. */ +#define HARDWARE_CONVERT(INSTRUCTION) \ + { \ + register const unsigned char* pInput asm ("8") = inptr; \ + register unsigned long long inlen asm ("9") = inend - inptr; \ + register unsigned char* pOutput asm ("10") = outptr; \ + register unsigned long long outlen asm("11") = outend - outptr; \ + uint64_t cc = 0; \ + \ + asm volatile ("0: " INSTRUCTION " \n\t" \ + " jo 0b \n\t" \ + " ipm %2 \n" \ + : "+a" (pOutput), "+a" (pInput), "+d" (cc), \ + "+d" (outlen), "+d" (inlen) \ + : \ + : "cc", "memory"); \ + \ + inptr = pInput; \ + outptr = pOutput; \ + cc >>= 28; \ + \ + if (cc == 1) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + else if (cc == 2) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } + +/* Conversion function from UTF-16 to UTF-32 internal/BE. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +/* The software routine is copied from utf-16.c (minus bytes + swapping). */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu24 %0, %1, 1"); \ + if (inptr != inend) \ + { \ + /* Check if the third byte is \ + a valid start of a UTF-16 surrogate. */ \ + if (inend - inptr == 3 && (inptr[3] & 0xfc) != 0xdc) \ + STANDARD_FROM_LOOP_ERR_HANDLER (3); \ + \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + continue; \ + } \ + \ + uint16_t u1 = get16 (inptr); \ + \ + if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff) \ + { \ + /* No surrogate. */ \ + put32 (outptr, u1); \ + inptr += 2; \ + } \ + else \ + { \ + /* It's a surrogate character. At least the first word says \ + it is. */ \ + if (__builtin_expect (inptr + 4 > inend, 0)) \ + { \ + /* We don't have enough input for another complete input \ + character. 
*/ \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + inptr += 2; \ + uint16_t u2 = get16 (inptr); \ + if (__builtin_expect (u2 < 0xdc00, 0) \ + || __builtin_expect (u2 > 0xdfff, 0)) \ + { \ + /* This is no valid second word for a surrogate. */ \ + inptr -= 2; \ + STANDARD_FROM_LOOP_ERR_HANDLER (2); \ + } \ + \ + put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00)); \ + inptr += 2; \ + } \ + outptr += 4; \ + } +#define LOOP_NEED_FLAGS +#include <iconv/loop.c> + +/* Conversion from UTF-32 internal/BE to UTF-16. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +/* The software routine is copied from utf-16.c (minus bytes + swapping). */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu42 %0, %1"); \ + \ + if (inptr != inend) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + continue; \ + } \ + \ + uint32_t c = get32 (inptr); \ + \ + if (__builtin_expect (c <= 0xd7ff, 1) \ + || (c >=0xdc00 && c <= 0xffff)) \ + { \ + /* Two UTF-16 chars. */ \ + put16 (outptr, c); \ + } \ + else if (__builtin_expect (c >= 0x10000, 1) \ + && __builtin_expect (c <= 0x10ffff, 1)) \ + { \ + /* Four UTF-16 chars. */ \ + uint16_t zabcd = ((c & 0x1f0000) >> 16) - 1; \ + uint16_t out; \ + \ + /* Generate a surrogate character. */ \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + out = 0xd800; \ + out |= (zabcd & 0xff) << 6; \ + out |= (c >> 10) & 0x3f; \ + put16 (outptr, out); \ + outptr += 2; \ + \ + out = 0xdc00; \ + out |= c & 0x3ff; \ + put16 (outptr, out); \ + } \ + else \ + { \ + STANDARD_TO_LOOP_ERR_HANDLER (4); \ + } \ + outptr += 2; \ + inptr += 4; \ + } +#define LOOP_NEED_FLAGS +#include <iconv/loop.c> + +#include <iconv/skeleton.c> diff --git a/sysdeps/s390/s390-64/utf8-utf16-z9.c b/sysdeps/s390/s390-64/utf8-utf16-z9.c new file mode 100644 index 0000000000..531d3ebd4b --- /dev/null +++ b/sysdeps/s390/s390-64/utf8-utf16-z9.c @@ -0,0 +1,463 @@ +/* Conversion between UTF-16 and UTF-32 BE/internal. + + This module uses the Z9-109 variants of the Convert Unicode + instructions. + Copyright (C) 1997-2009 Free Software Foundation, Inc. + + Author: Andreas Krebbel <Andreas.Krebbel@de.ibm.com> + Based on the work by Ulrich Drepper <drepper@cygnus.com>, 1997. + + Thanks to Daniel Appich who covered the relevant performance work + in his diploma thesis. + + This is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <dlfcn.h> +#include <stdint.h> +#include <unistd.h> +#include <dl-procinfo.h> +#include <gconv.h> + +/* UTF-16 big endian byte order mark. 
*/ +#define BOM_UTF16 0xfeff + +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +#define MIN_NEEDED_FROM 1 +#define MAX_NEEDED_FROM 4 +#define MIN_NEEDED_TO 2 +#define MAX_NEEDED_TO 4 +#define FROM_LOOP from_utf8_loop +#define TO_LOOP to_utf8_loop +#define FROM_DIRECTION (dir == from_utf8) +#define PREPARE_LOOP \ + enum direction dir = ((struct utf8_data *) step->__data)->dir; \ + int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \ + \ + if (emit_bom && !data->__internal_use \ + && data->__invocation_counter == 0) \ + { \ + /* Emit the UTF-16 Byte Order Mark. */ \ + if (__builtin_expect (outbuf + 2 > outend, 0)) \ + return __GCONV_FULL_OUTPUT; \ + \ + put16u (outbuf, BOM_UTF16); \ + outbuf += 2; \ + } + +/* Direction of the transformation. */ +enum direction +{ + illegal_dir, + to_utf8, + from_utf8 +}; + +struct utf8_data +{ + enum direction dir; + int emit_bom; +}; + + +extern int gconv_init (struct __gconv_step *step); +int +gconv_init (struct __gconv_step *step) +{ + /* Determine which direction. */ + struct utf8_data *new_data; + enum direction dir = illegal_dir; + int emit_bom; + int result; + + emit_bom = (__strcasecmp (step->__to_name, "UTF-16//") == 0); + + if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0 + && (__strcasecmp (step->__to_name, "UTF-16//") == 0 + || __strcasecmp (step->__to_name, "UTF-16BE//") == 0)) + { + dir = from_utf8; + } + else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0 + && __strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0) + { + dir = to_utf8; + } + + result = __GCONV_NOCONV; + if (dir != illegal_dir) + { + new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data)); + + result = __GCONV_NOMEM; + if (new_data != NULL) + { + new_data->dir = dir; + new_data->emit_bom = emit_bom; + step->__data = new_data; + + if (dir == from_utf8) + { + step->__min_needed_from = MIN_NEEDED_FROM; + step->__max_needed_from = MIN_NEEDED_FROM; + step->__min_needed_to = MIN_NEEDED_TO; + step->__max_needed_to = MIN_NEEDED_TO; + } + else + { + step->__min_needed_from = MIN_NEEDED_TO; + step->__max_needed_from = MIN_NEEDED_TO; + step->__min_needed_to = MIN_NEEDED_FROM; + step->__max_needed_to = MIN_NEEDED_FROM; + } + + step->__stateful = 0; + + result = __GCONV_OK; + } + } + + return result; +} + + +extern void gconv_end (struct __gconv_step *data); +void +gconv_end (struct __gconv_step *data) +{ + free (data->__data); +} + +/* The macro for the hardware loop. This is used for both + directions. */ +#define HARDWARE_CONVERT(INSTRUCTION) \ + { \ + register const unsigned char* pInput asm ("8") = inptr; \ + register unsigned long long inlen asm ("9") = inend - inptr; \ + register unsigned char* pOutput asm ("10") = outptr; \ + register unsigned long long outlen asm("11") = outend - outptr; \ + uint64_t cc = 0; \ + \ + asm volatile ("0: " INSTRUCTION " \n\t" \ + " jo 0b \n\t" \ + " ipm %2 \n" \ + : "+a" (pOutput), "+a" (pInput), "+d" (cc), \ + "+d" (outlen), "+d" (inlen) \ + : \ + : "cc", "memory"); \ + \ + inptr = pInput; \ + outptr = pOutput; \ + cc >>= 28; \ + \ + if (cc == 1) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + else if (cc == 2) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } + +/* Conversion function from UTF-8 to UTF-16. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +/* The software implementation is based on the code in gconv_simple.c. 
*/ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu12 %0, %1, 1"); \ + \ + if (inptr != inend) \ + { \ + int i; \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + continue; \ + } \ + \ + /* Next input byte. */ \ + uint16_t ch = *inptr; \ + \ + if (__builtin_expect (ch < 0x80, 1)) \ + { \ + /* One byte sequence. */ \ + ++inptr; \ + } \ + else \ + { \ + uint_fast32_t cnt; \ + uint_fast32_t i; \ + \ + if (ch >= 0xc2 && ch < 0xe0) \ + { \ + /* We expect two bytes. The first byte cannot be 0xc0 \ + or 0xc1, otherwise the wide character could have been \ + represented using a single byte. */ \ + cnt = 2; \ + ch &= 0x1f; \ + } \ + else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ + { \ + /* We expect three bytes. */ \ + cnt = 3; \ + ch &= 0x0f; \ + } \ + else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ + { \ + /* We expect four bytes. */ \ + cnt = 4; \ + ch &= 0x07; \ + } \ + else \ + { \ + /* Search the end of this ill-formed UTF-8 character. This \ + is the next byte with (x & 0xc0) != 0x80. */ \ + i = 0; \ + do \ + ++i; \ + while (inptr + i < inend \ + && (*(inptr + i) & 0xc0) == 0x80 \ + && i < 5); \ + \ + errout: \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + \ + if (__builtin_expect (inptr + cnt > inend, 0)) \ + { \ + /* We don't have enough input. But before we report \ + that check that all the bytes are correct. */ \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + goto errout; \ + } \ + \ + if (cnt == 4) \ + { \ + /* For 4 byte UTF-8 chars two UTF-16 chars (high and \ + low) are needed. */ \ + uint16_t zabcd, high, low; \ + \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + /* See Principles of Operations cu12. */ \ + zabcd = (((inptr[0] & 0x7) << 2) | \ + ((inptr[1] & 0x30) >> 4)) - 1; \ + \ + /* z-bit must be zero after subtracting 1. */ \ + if (zabcd & 0x10) \ + STANDARD_FROM_LOOP_ERR_HANDLER (4) \ + \ + high = (uint16_t)(0xd8 << 8); /* high surrogate id */ \ + high |= zabcd << 6; /* abcd bits */ \ + high |= (inptr[1] & 0xf) << 2; /* efgh bits */ \ + high |= (inptr[2] & 0x30) >> 4; /* ij bits */ \ + \ + low = (uint16_t)(0xdc << 8); /* low surrogate id */ \ + low |= ((uint16_t)inptr[2] & 0xc) << 6; /* kl bits */ \ + low |= (inptr[2] & 0x3) << 6; /* mn bits */ \ + low |= inptr[3] & 0x3f; /* opqrst bits */ \ + \ + put16 (outptr, high); \ + outptr += 2; \ + put16 (outptr, low); \ + outptr += 2; \ + inptr += 4; \ + continue; \ + } \ + else \ + { \ + /* Read the possible remaining bytes. */ \ + for (i = 1; i < cnt; ++i) \ + { \ + uint16_t byte = inptr[i]; \ + \ + if ((byte & 0xc0) != 0x80) \ + /* This is an illegal encoding. */ \ + break; \ + \ + ch <<= 6; \ + ch |= byte & 0x3f; \ + } \ + inptr += cnt; \ + \ + } \ + } \ + /* Now adjust the pointers and store the result. */ \ + *((uint16_t *) outptr) = ch; \ + outptr += sizeof (uint16_t); \ + } + +#define LOOP_NEED_FLAGS +#include <iconv/loop.c> + +/* Conversion from UTF-16 to UTF-8. 
*/ + +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +/* The software routine is based on the functionality of the S/390 + hardware instruction (cu21) as described in the Principles of + Operation. */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu21 %0, %1"); \ + if (inptr != inend) \ + { \ + /* Check if the third byte is \ + a valid start of a UTF-16 surrogate. */ \ + if (inend - inptr == 3 && (inptr[3] & 0xfc) != 0xdc) \ + STANDARD_TO_LOOP_ERR_HANDLER (3); \ + \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + continue; \ + } \ + \ + uint16_t c = get16 (inptr); \ + \ + if (__builtin_expect (c <= 0x007f, 1)) \ + { \ + /* Single byte UTF-8 char. */ \ + *outptr = c & 0xff; \ + outptr++; \ + } \ + else if (c >= 0x0080 && c <= 0x07ff) \ + { \ + /* Two byte UTF-8 char. */ \ + \ + if (__builtin_expect (outptr + 2 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + outptr[0] = 0xc0; \ + outptr[0] |= c >> 6; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= c & 0x3f; \ + \ + outptr += 2; \ + } \ + else if (c >= 0x0800 && c <= 0xd7ff) \ + { \ + /* Three byte UTF-8 char. */ \ + \ + if (__builtin_expect (outptr + 3 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + outptr[0] = 0xe0; \ + outptr[0] |= c >> 12; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= (c >> 6) & 0x3f; \ + \ + outptr[2] = 0x80; \ + outptr[2] |= c & 0x3f; \ + \ + outptr += 3; \ + } \ + else if (c >= 0xd800 && c <= 0xdbff) \ + { \ + /* Four byte UTF-8 char. */ \ + uint16_t low, uvwxy; \ + \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + inptr += 2; \ + if (__builtin_expect (inptr + 2 > inend, 0)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + low = get16 (inptr); \ + \ + if ((low & 0xfc00) != 0xdc00) \ + { \ + inptr -= 2; \ + STANDARD_TO_LOOP_ERR_HANDLER (2); \ + } \ + uvwxy = ((c >> 6) & 0xf) + 1; \ + outptr[0] = 0xf0; \ + outptr[0] |= uvwxy >> 2; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= (uvwxy << 4) & 0x30; \ + outptr[1] |= (c >> 2) & 0x0f; \ + \ + outptr[2] = 0x80; \ + outptr[2] |= (c & 0x03) << 4; \ + outptr[2] |= (low >> 6) & 0x0f; \ + \ + outptr[3] = 0x80; \ + outptr[3] |= low & 0x3f; \ + \ + outptr += 4; \ + } \ + else \ + { \ + STANDARD_TO_LOOP_ERR_HANDLER (2); \ + } \ + inptr += 2; \ + } +#define LOOP_NEED_FLAGS +#include <iconv/loop.c> + +#include <iconv/skeleton.c> diff --git a/sysdeps/s390/s390-64/utf8-utf32-z9.c b/sysdeps/s390/s390-64/utf8-utf32-z9.c new file mode 100644 index 0000000000..17ef8bc890 --- /dev/null +++ b/sysdeps/s390/s390-64/utf8-utf32-z9.c @@ -0,0 +1,508 @@ +/* Conversion between UTF-8 and UTF-32 BE/internal. + + This module uses the Z9-109 variants of the Convert Unicode + instructions. + Copyright (C) 1997-2009 Free Software Foundation, Inc. + + Author: Andreas Krebbel <Andreas.Krebbel@de.ibm.com> + Based on the work by Ulrich Drepper <drepper@cygnus.com>, 1997. + + Thanks to Daniel Appich who covered the relevant performance work + in his diploma thesis. 
+ + This is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <dlfcn.h> +#include <stdint.h> +#include <unistd.h> +#include <dl-procinfo.h> +#include <gconv.h> + +/* UTF-32 big endian byte order mark. */ +#define BOM 0x0000feffu + +#define DEFINE_INIT 0 +#define DEFINE_FINI 0 +/* These definitions apply to the UTF-8 to UTF-32 direction. The + software implementation for UTF-8 still supports multibyte + characters up to 6 bytes whereas the hardware variant does not. */ +#define MIN_NEEDED_FROM 1 +#define MAX_NEEDED_FROM 6 +#define MIN_NEEDED_TO 4 +#define FROM_LOOP from_utf8_loop +#define TO_LOOP to_utf8_loop +#define FROM_DIRECTION (dir == from_utf8) +#define PREPARE_LOOP \ + enum direction dir = ((struct utf8_data *) step->__data)->dir; \ + int emit_bom = ((struct utf8_data *) step->__data)->emit_bom; \ + \ + if (emit_bom && !data->__internal_use \ + && data->__invocation_counter == 0) \ + { \ + /* Emit the Byte Order Mark. */ \ + if (__builtin_expect (outbuf + 4 > outend, 0)) \ + return __GCONV_FULL_OUTPUT; \ + \ + put32u (outbuf, BOM); \ + outbuf += 4; \ + } + +/* Direction of the transformation. */ +enum direction +{ + illegal_dir, + to_utf8, + from_utf8 +}; + +struct utf8_data +{ + enum direction dir; + int emit_bom; +}; + + +extern int gconv_init (struct __gconv_step *step); +int +gconv_init (struct __gconv_step *step) +{ + /* Determine which direction. 
*/ + struct utf8_data *new_data; + enum direction dir = illegal_dir; + int emit_bom; + int result; + + emit_bom = (__strcasecmp (step->__to_name, "UTF-32//") == 0); + + if (__strcasecmp (step->__from_name, "ISO-10646/UTF8/") == 0 + && (__strcasecmp (step->__to_name, "UTF-32//") == 0 + || __strcasecmp (step->__to_name, "UTF-32BE//") == 0 + || __strcasecmp (step->__to_name, "INTERNAL") == 0)) + { + dir = from_utf8; + } + else if (__strcasecmp (step->__to_name, "ISO-10646/UTF8/") == 0 + && (__strcasecmp (step->__from_name, "UTF-32BE//") == 0 + || __strcasecmp (step->__from_name, "INTERNAL") == 0)) + { + dir = to_utf8; + } + + result = __GCONV_NOCONV; + if (dir != illegal_dir) + { + new_data = (struct utf8_data *) malloc (sizeof (struct utf8_data)); + + result = __GCONV_NOMEM; + if (new_data != NULL) + { + new_data->dir = dir; + new_data->emit_bom = emit_bom; + step->__data = new_data; + + if (dir == from_utf8) + { + step->__min_needed_from = MIN_NEEDED_FROM; + step->__max_needed_from = MIN_NEEDED_FROM; + step->__min_needed_to = MIN_NEEDED_TO; + step->__max_needed_to = MIN_NEEDED_TO; + } + else + { + step->__min_needed_from = MIN_NEEDED_TO; + step->__max_needed_from = MIN_NEEDED_TO; + step->__min_needed_to = MIN_NEEDED_FROM; + step->__max_needed_to = MIN_NEEDED_FROM; + } + + step->__stateful = 0; + + result = __GCONV_OK; + } + } + + return result; +} + + +extern void gconv_end (struct __gconv_step *data); +void +gconv_end (struct __gconv_step *data) +{ + free (data->__data); +} + +/* The macro for the hardware loop. This is used for both + directions. */ +#define HARDWARE_CONVERT(INSTRUCTION) \ + { \ + register const unsigned char* pInput asm ("8") = inptr; \ + register unsigned long long inlen asm ("9") = inend - inptr; \ + register unsigned char* pOutput asm ("10") = outptr; \ + register unsigned long long outlen asm("11") = outend - outptr; \ + uint64_t cc = 0; \ + \ + asm volatile ("0: " INSTRUCTION " \n\t" \ + " jo 0b \n\t" \ + " ipm %2 \n" \ + : "+a" (pOutput), "+a" (pInput), "+d" (cc), \ + "+d" (outlen), "+d" (inlen) \ + : \ + : "cc", "memory"); \ + \ + inptr = pInput; \ + outptr = pOutput; \ + cc >>= 28; \ + \ + if (cc == 1) \ + { \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + else if (cc == 2) \ + { \ + result = __GCONV_ILLEGAL_INPUT; \ + break; \ + } \ + } + +/* Conversion function from UTF-8 to UTF-32 internal/BE. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_FROM +#define MAX_NEEDED_INPUT MAX_NEEDED_FROM +#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO +#define LOOPFCT FROM_LOOP +/* The software routine is copied from gconv_simple.c. */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu14 %0, %1, 1"); \ + \ + if (inptr != inend) \ + { \ + int i; \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + continue; \ + } \ + \ + /* Next input byte. */ \ + uint32_t ch = *inptr; \ + \ + if (__builtin_expect (ch < 0x80, 1)) \ + { \ + /* One byte sequence. */ \ + ++inptr; \ + } \ + else \ + { \ + uint_fast32_t cnt; \ + uint_fast32_t i; \ + \ + if (ch >= 0xc2 && ch < 0xe0) \ + { \ + /* We expect two bytes. The first byte cannot be 0xc0 or \ + 0xc1, otherwise the wide character could have been \ + represented using a single byte. */ \ + cnt = 2; \ + ch &= 0x1f; \ + } \ + else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ + { \ + /* We expect three bytes. 
*/ \ + cnt = 3; \ + ch &= 0x0f; \ + } \ + else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ + { \ + /* We expect four bytes. */ \ + cnt = 4; \ + ch &= 0x07; \ + } \ + else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ + { \ + /* We expect five bytes. */ \ + cnt = 5; \ + ch &= 0x03; \ + } \ + else if (__builtin_expect ((ch & 0xfe) == 0xfc, 1)) \ + { \ + /* We expect six bytes. */ \ + cnt = 6; \ + ch &= 0x01; \ + } \ + else \ + { \ + /* Search the end of this ill-formed UTF-8 character. This \ + is the next byte with (x & 0xc0) != 0x80. */ \ + i = 0; \ + do \ + ++i; \ + while (inptr + i < inend \ + && (*(inptr + i) & 0xc0) == 0x80 \ + && i < 5); \ + \ + errout: \ + STANDARD_FROM_LOOP_ERR_HANDLER (i); \ + } \ + \ + if (__builtin_expect (inptr + cnt > inend, 0)) \ + { \ + /* We don't have enough input. But before we report \ + that check that all the bytes are correct. */ \ + for (i = 1; inptr + i < inend; ++i) \ + if ((inptr[i] & 0xc0) != 0x80) \ + break; \ + \ + if (__builtin_expect (inptr + i == inend, 1)) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + \ + goto errout; \ + } \ + \ + /* Read the possible remaining bytes. */ \ + for (i = 1; i < cnt; ++i) \ + { \ + uint32_t byte = inptr[i]; \ + \ + if ((byte & 0xc0) != 0x80) \ + /* This is an illegal encoding. */ \ + break; \ + \ + ch <<= 6; \ + ch |= byte & 0x3f; \ + } \ + \ + /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \ + If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \ + have been represented with fewer than cnt bytes. */ \ + if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)) \ + { \ + /* This is an illegal encoding. */ \ + goto errout; \ + } \ + \ + inptr += cnt; \ + } \ + \ + /* Now adjust the pointers and store the result. */ \ + *((uint32_t *) outptr) = ch; \ + outptr += sizeof (uint32_t); \ + } +#define LOOP_NEED_FLAGS + +#define STORE_REST \ + { \ + /* We store the remaining bytes while converting them into the UCS4 \ + format. We can assume that the first byte in the buffer is \ + correct and that it requires a larger number of bytes than there \ + are in the input buffer. */ \ + wint_t ch = **inptrp; \ + size_t cnt, r; \ + \ + state->__count = inend - *inptrp; \ + \ + if (ch >= 0xc2 && ch < 0xe0) \ + { \ + /* We expect two bytes. The first byte cannot be 0xc0 or \ + 0xc1, otherwise the wide character could have been \ + represented using a single byte. */ \ + cnt = 2; \ + ch &= 0x1f; \ + } \ + else if (__builtin_expect ((ch & 0xf0) == 0xe0, 1)) \ + { \ + /* We expect three bytes. */ \ + cnt = 3; \ + ch &= 0x0f; \ + } \ + else if (__builtin_expect ((ch & 0xf8) == 0xf0, 1)) \ + { \ + /* We expect four bytes. */ \ + cnt = 4; \ + ch &= 0x07; \ + } \ + else if (__builtin_expect ((ch & 0xfc) == 0xf8, 1)) \ + { \ + /* We expect five bytes. */ \ + cnt = 5; \ + ch &= 0x03; \ + } \ + else \ + { \ + /* We expect six bytes. */ \ + cnt = 6; \ + ch &= 0x01; \ + } \ + \ + /* The first byte is already consumed. */ \ + r = cnt - 1; \ + while (++(*inptrp) < inend) \ + { \ + ch <<= 6; \ + ch |= **inptrp & 0x3f; \ + --r; \ + } \ + \ + /* Shift for the so far missing bytes. */ \ + ch <<= r * 6; \ + \ + /* Store the number of bytes expected for the entire sequence. */ \ + state->__count |= cnt << 8; \ + \ + /* Store the value. 
*/ \ + state->__value.__wch = ch; \ + } + +#define UNPACK_BYTES \ + { \ + static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \ + wint_t wch = state->__value.__wch; \ + size_t ntotal = state->__count >> 8; \ + \ + inlen = state->__count & 255; \ + \ + bytebuf[0] = inmask[ntotal - 2]; \ + \ + do \ + { \ + if (--ntotal < inlen) \ + bytebuf[ntotal] = 0x80 | (wch & 0x3f); \ + wch >>= 6; \ + } \ + while (ntotal > 1); \ + \ + bytebuf[0] |= wch; \ + } + +#define CLEAR_STATE \ + state->__count = 0 + +#include <iconv/loop.c> + +/* Conversion from UTF-32 internal/BE to UTF-8. */ + +#define MIN_NEEDED_INPUT MIN_NEEDED_TO +#define MIN_NEEDED_OUTPUT MIN_NEEDED_FROM +#define MAX_NEEDED_OUTPUT MAX_NEEDED_FROM +#define LOOPFCT TO_LOOP +/* The software routine mimics the S/390 cu41 instruction. */ +#define BODY \ + { \ + if (GLRO (dl_hwcap) & HWCAP_S390_ETF3EH) \ + { \ + HARDWARE_CONVERT ("cu41 %0, %1"); \ + \ + if (inptr != inend) \ + { \ + result = __GCONV_INCOMPLETE_INPUT; \ + break; \ + } \ + continue; \ + } \ + \ + uint32_t wc = *((const uint32_t *) inptr); \ + \ + if (__builtin_expect (wc <= 0x7f, 1)) \ + { \ + /* Single UTF-8 char. */ \ + *outptr = (uint8_t)wc; \ + outptr++; \ + } \ + else if (wc <= 0x7ff) \ + { \ + /* Two UTF-8 chars. */ \ + if (__builtin_expect (outptr + 2 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + \ + outptr[0] = 0xc0; \ + outptr[0] |= wc >> 6; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= wc & 0x3f; \ + \ + outptr += 2; \ + } \ + else if (wc <= 0xffff) \ + { \ + /* Three UTF-8 chars. */ \ + if (__builtin_expect (outptr + 3 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + outptr[0] = 0xe0; \ + outptr[0] |= wc >> 12; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= (wc >> 6) & 0x3f; \ + \ + outptr[2] = 0x80; \ + outptr[2] |= wc & 0x3f; \ + \ + outptr += 3; \ + } \ + else if (wc <= 0x10ffff) \ + { \ + /* Four UTF-8 chars. */ \ + if (__builtin_expect (outptr + 4 > outend, 0)) \ + { \ + /* Overflow in the output buffer. */ \ + result = __GCONV_FULL_OUTPUT; \ + break; \ + } \ + outptr[0] = 0xf0; \ + outptr[0] |= wc >> 18; \ + \ + outptr[1] = 0x80; \ + outptr[1] |= (wc >> 12) & 0x3f; \ + \ + outptr[2] = 0x80; \ + outptr[2] |= (wc >> 6) & 0x3f; \ + \ + outptr[3] = 0x80; \ + outptr[3] |= wc & 0x3f; \ + \ + outptr += 4; \ + } \ + else \ + { \ + STANDARD_TO_LOOP_ERR_HANDLER (4); \ + } \ + inptr += 4; \ + } +#define LOOP_NEED_FLAGS +#include <iconv/loop.c> + +#include <iconv/skeleton.c> diff --git a/sysdeps/unix/sysv/linux/configure b/sysdeps/unix/sysv/linux/configure index 253e9c57ff..199457a3ac 100644 --- a/sysdeps/unix/sysv/linux/configure +++ b/sysdeps/unix/sysv/linux/configure @@ -1,17 +1,6 @@ # This file is generated from configure.in by Autoconf. DO NOT EDIT! # Local configure fragment for sysdeps/unix/sysv/linux. -# The Linux kernel headers can be found in -# /lib/modules/$(uname -r)/build/include -# Check whether this directory is available. -if test -z "$sysheaders" && - test "x$cross_compiling" = xno && - test -d /lib/modules/`uname -r`/build/include; then - sysheaders="/lib/modules/`uname -r`/build/include" - ccheaders=`$CC -print-file-name=include` - SYSINCLUDES="-I $sysheaders" -fi - # Don't bother trying to generate any glue code to be compatible with the # existing system library, because we are the only system library. 
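The from-UTF-8 software loop in the s390 utf8-utf32-z9.c module above derives the expected sequence length from the lead byte, verifies the continuation bytes, and rejects overlong encodings with the (ch >> (5 * cnt - 4)) == 0 test. A minimal C sketch of that validation logic (an illustrative helper, not part of the patch, limited to the 1-4 byte forms):

#include <stddef.h>
#include <stdint.h>

/* Decode one UTF-8 sequence the way the software fallback above does.
   Returns the number of bytes consumed, or 0 for an ill-formed or
   incomplete sequence.  */
static size_t
decode_utf8 (const unsigned char *p, size_t avail, uint32_t *out)
{
  uint32_t ch = p[0];
  size_t cnt;

  if (ch < 0x80)
    {
      /* One byte sequence.  */
      *out = ch;
      return 1;
    }
  else if (ch >= 0xc2 && ch < 0xe0)
    {
      /* Two bytes; lead bytes 0xc0 and 0xc1 would be overlong.  */
      cnt = 2;
      ch &= 0x1f;
    }
  else if ((ch & 0xf0) == 0xe0)
    {
      cnt = 3;
      ch &= 0x0f;
    }
  else if ((ch & 0xf8) == 0xf0)
    {
      cnt = 4;
      ch &= 0x07;
    }
  else
    return 0;                   /* invalid lead byte */

  if (avail < cnt)
    return 0;                   /* incomplete input */

  for (size_t i = 1; i < cnt; ++i)
    {
      if ((p[i] & 0xc0) != 0x80)
        return 0;               /* continuation byte missing */
      ch = (ch << 6) | (p[i] & 0x3f);
    }

  /* Same overlong check as the loop body: a cnt-byte sequence is
     ill-formed if its value fits in the 5*cnt-4 bits that a shorter
     sequence could already carry.  */
  if (cnt > 2 && (ch >> (5 * cnt - 4)) == 0)
    return 0;

  *out = ch;
  return cnt;
}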
inhibit_glue=yes diff --git a/sysdeps/unix/sysv/linux/configure.in b/sysdeps/unix/sysv/linux/configure.in index 5330e98c2d..8f00407a8b 100644 --- a/sysdeps/unix/sysv/linux/configure.in +++ b/sysdeps/unix/sysv/linux/configure.in @@ -1,19 +1,6 @@ GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. # Local configure fragment for sysdeps/unix/sysv/linux. -# The Linux kernel headers can be found in -# /lib/modules/$(uname -r)/build/include -# Check whether this directory is available. -if test -z "$sysheaders" && - test "x$cross_compiling" = xno && - test -d /lib/modules/`uname -r`/build/include; then - sysheaders="/lib/modules/`uname -r`/build/include" - ccheaders=`$CC -print-file-name=include` - dnl We don't have to use -nostdinc. We just want one more directory - dnl to be used. - SYSINCLUDES="-I $sysheaders" -fi - # Don't bother trying to generate any glue code to be compatible with the # existing system library, because we are the only system library. inhibit_glue=yes diff --git a/sysdeps/unix/sysv/linux/eventfd.c b/sysdeps/unix/sysv/linux/eventfd.c index 4cd557983e..7f69ecdb8c 100644 --- a/sysdeps/unix/sysv/linux/eventfd.c +++ b/sysdeps/unix/sysv/linux/eventfd.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -19,14 +19,21 @@ #include <errno.h> #include <sys/eventfd.h> #include <sysdep.h> +#include <kernel-features.h> int eventfd (int count, int flags) { #ifdef __NR_eventfd2 - return INLINE_SYSCALL (eventfd2, 2, count, flags); -#else + int res = INLINE_SYSCALL (eventfd2, 2, count, flags); +# ifndef __ASSUME_EVENTFD2 + if (res != -1 || errno != ENOSYS) +# endif + return res; +#endif + +#ifndef __ASSUME_EVENTFD2 /* The old system call has no flag parameter which is bad. So we have to wait until we have to support to pass additional values to the kernel (sys_indirect) before implementing setting flags like @@ -43,5 +50,7 @@ eventfd (int count, int flags) __set_errno (ENOSYS); return -1; # endif +#elif !defined __NR_eventfd2 +# error "__ASSUME_EVENTFD2 defined but not __NR_eventfd2" #endif } diff --git a/sysdeps/unix/sysv/linux/i386/sysconf.c b/sysdeps/unix/sysv/linux/i386/sysconf.c index efe1a639cd..ff3cf9f7c7 100644 --- a/sysdeps/unix/sysv/linux/i386/sysconf.c +++ b/sysdeps/unix/sysv/linux/i386/sysconf.c @@ -138,6 +138,9 @@ static const struct intel_02_cache_info { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 }, { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 }, { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 }, + { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 }, + { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 }, + { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 }, }; #define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known[0])) diff --git a/sysdeps/unix/sysv/linux/kernel-features.h b/sysdeps/unix/sysv/linux/kernel-features.h index 4562515790..ff065effb5 100644 --- a/sysdeps/unix/sysv/linux/kernel-features.h +++ b/sysdeps/unix/sysv/linux/kernel-features.h @@ -516,6 +516,8 @@ # define __ASSUME_SOCK_CLOEXEC 1 # define __ASSUME_IN_NONBLOCK 1 # define __ASSUME_PIPE2 1 +# define __ASSUME_EVENTFD2 1 +# define __ASSUME_SIGNALFD4 1 #endif /* Support for the accept4 syscall was added in 2.6.28. 
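Taken together, the eventfd.c hunks above (and the matching signalfd.c hunks below) implement a try-the-new-syscall-first pattern: issue eventfd2/signalfd4 whenever the syscall number is known at build time, and fall back to the old flag-less syscall only when the running kernel reports ENOSYS and the build does not already assume the newer kernel. A condensed sketch of the resulting eventfd logic, using the same glibc-internal INLINE_SYSCALL macro as the file (not the verbatim source):

int
eventfd (int count, int flags)
{
#ifdef __NR_eventfd2
  int res = INLINE_SYSCALL (eventfd2, 2, count, flags);
# ifndef __ASSUME_EVENTFD2
  if (res != -1 || errno != ENOSYS)   /* success, or a real error */
# endif
    return res;
#endif

#ifndef __ASSUME_EVENTFD2
  /* Old kernel: the original syscall has no flags argument.  */
  if (flags != 0)
    {
      __set_errno (EINVAL);
      return -1;
    }
# ifdef __NR_eventfd
  return INLINE_SYSCALL (eventfd, 1, count);
# else
  __set_errno (ENOSYS);
  return -1;
# endif
#endif
}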
*/ diff --git a/sysdeps/unix/sysv/linux/signalfd.c b/sysdeps/unix/sysv/linux/signalfd.c index 9898f29231..c2d974a45d 100644 --- a/sysdeps/unix/sysv/linux/signalfd.c +++ b/sysdeps/unix/sysv/linux/signalfd.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2007, 2008 Free Software Foundation, Inc. +/* Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -20,14 +20,21 @@ #include <signal.h> #include <sys/signalfd.h> #include <sysdep.h> +#include <kernel-features.h> int signalfd (int fd, const sigset_t *mask, int flags) { #ifdef __NR_signalfd4 - return INLINE_SYSCALL (signalfd4, 4, fd, mask, _NSIG / 8, flags); -#else + int res = INLINE_SYSCALL (signalfd4, 4, fd, mask, _NSIG / 8, flags); +# ifndef __ASSUME_SIGNALFD4 + if (res != -1 || errno != ENOSYS) +# endif + return res; +#endif + +#ifndef __ASSUME_SIGNALFD4 /* The old system call has no flag parameter which is bad. So we have to wait until we have to support to pass additional values to the kernel (sys_indirect) before implementing setting flags like @@ -44,5 +51,7 @@ signalfd (int fd, const sigset_t *mask, int flags) __set_errno (ENOSYS); return -1; # endif +#elif !defined __NR_signalfd4 +# error "__ASSUME_SIGNALFD4 defined but not __NR_signalfd4" #endif } diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index 78fdb04fcb..57cd88432a 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -19,6 +19,10 @@ ifeq ($(subdir),elf) sysdep-dl-routines += tlsdesc dl-tlsdesc sysdep_routines += tlsdesc dl-tlsdesc sysdep-rtld-routines += tlsdesc dl-tlsdesc + +tests: $(objpfx)tst-xmmymm.out +$(objpfx)tst-xmmymm.out: ../sysdeps/x86_64/tst-xmmymm.sh $(objpfx)ld.so + $(SHELL) -e $< $(objpfx) > $@ endif ifeq ($(subdir),csu) diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c index 362687c181..75b81958dd 100644 --- a/sysdeps/x86_64/cacheinfo.c +++ b/sysdeps/x86_64/cacheinfo.c @@ -25,6 +25,17 @@ #ifdef USE_MULTIARCH # include "multiarch/init-arch.h" + +# define is_intel __cpu_features.kind == arch_kind_intel +# define is_amd __cpu_features.kind == arch_kind_amd +# define max_cpuid __cpu_features.max_cpuid +#else + /* This spells out "GenuineIntel". */ +# define is_intel \ + ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69 + /* This spells out "AuthenticAMD". */ +# define is_amd \ + ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65 #endif static const struct intel_02_cache_info @@ -100,6 +111,9 @@ static const struct intel_02_cache_info { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 2097152 }, { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 4194304 }, { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 8388608 }, + { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 }, + { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 }, + { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 }, }; #define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0])) @@ -152,6 +166,12 @@ intel_check_word (int name, unsigned int value, bool *has_level_2, /* Intel reused this value. For family 15, model 6 it specifies the 3rd level cache. Otherwise the 2nd level cache. 
*/ + unsigned int family; + unsigned int model; +#ifdef USE_MULTIARCH + family = __cpu_features.family; + model = __cpu_features.model; +#else unsigned int eax; unsigned int ebx; unsigned int ecx; @@ -160,9 +180,10 @@ intel_check_word (int name, unsigned int value, bool *has_level_2, : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (1)); - unsigned int family = ((eax >> 20) & 0xff) + ((eax >> 8) & 0xf); - unsigned int model = ((((eax >>16) & 0xf) << 4) - + ((eax >> 4) & 0xf)); + family = ((eax >> 20) & 0xff) + ((eax >> 8) & 0xf); + model = (((eax >>16) & 0xf) << 4) + ((eax >> 4) & 0xf); +#endif + if (family == 15 && model == 6) { /* The level 3 cache is encoded for this model like @@ -394,21 +415,24 @@ long int attribute_hidden __cache_sysconf (int name) { +#ifdef USE_MULTIARCH + if (__cpu_features.kind == arch_kind_unknown) + __init_cpu_features (); +#else /* Find out what brand of processor. */ - unsigned int eax; + unsigned int max_cpuid; unsigned int ebx; unsigned int ecx; unsigned int edx; asm volatile ("cpuid" - : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + : "=a" (max_cpuid), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (0)); +#endif - /* This spells out "GenuineIntel". */ - if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69) - return handle_intel (name, eax); + if (is_intel) + return handle_intel (name, max_cpuid); - /* This spells out "AuthenticAMD". */ - if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) + if (is_amd) return handle_amd (name); // XXX Fill in more vendors. @@ -457,20 +481,11 @@ init_cacheinfo (void) #ifdef USE_MULTIARCH if (__cpu_features.kind == arch_kind_unknown) __init_cpu_features (); -# define is_intel __cpu_features.kind == arch_kind_intel -# define is_amd __cpu_features.kind == arch_kind_amd -# define max_cpuid __cpu_features.max_cpuid #else int max_cpuid; asm volatile ("cpuid" : "=a" (max_cpuid), "=b" (ebx), "=c" (ecx), "=d" (edx) : "0" (0)); - /* This spells out "GenuineIntel". */ -# define is_intel \ - ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69 - /* This spells out "AuthenticAMD". */ -# define is_amd \ - ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65 #endif if (is_intel) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 5ce14aad8d..b066402204 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,7 +4,7 @@ gen-as-const-headers += ifunc-defines.sym endif ifeq ($(subdir),string) -sysdep_routines += stpncpy-c strncpy-c strncmp-c +sysdep_routines += stpncpy-c strncpy-c ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-strcspn-c.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index 29e687344d..35fd19af0e 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -68,7 +68,13 @@ __init_cpu_features (void) __cpu_features.model += extended_model; } else if (__cpu_features.family == 0x06) - __cpu_features.model += extended_model; + { + __cpu_features.model += extended_model; + + if (__cpu_features.model == 0x1c) + /* Avoid SSSE3 on Atom since it is slow. */ + __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx &= ~(1 << 9); + } } /* This spells out "AuthenticAMD". 
*/ else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) diff --git a/sysdeps/x86_64/multiarch/rtld-rawmemchr.c b/sysdeps/x86_64/multiarch/rtld-rawmemchr.c new file mode 100644 index 0000000000..53a90675ab --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-rawmemchr.c @@ -0,0 +1 @@ +#include "../rtld-rawmemchr.c" diff --git a/sysdeps/x86_64/multiarch/rtld-strlen.S b/sysdeps/x86_64/multiarch/rtld-strlen.S new file mode 100644 index 0000000000..596e0549ea --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-strlen.S @@ -0,0 +1 @@ +#include "../rtld-strlen.S" diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S index 37985036aa..1a315737af 100644 --- a/sysdeps/x86_64/multiarch/strcmp.S +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -28,9 +28,9 @@ /* calculate left number to compare */ \ lea -16(%rcx, %r11), %r9; \ cmp %r9, %r11; \ - jb LABEL(strcmp_exitz); \ + jb LABEL(strcmp_exitz_sse4_2); \ test %r9, %r9; \ - je LABEL(strcmp_exitz); \ + je LABEL(strcmp_exitz_sse4_2); \ mov %r9, %r11 #define STRCMP_SSE42 __strncmp_sse42 @@ -106,9 +106,9 @@ STRCMP_SSE42: */ #ifdef USE_AS_STRNCMP test %rdx, %rdx - je LABEL(strcmp_exitz) + je LABEL(strcmp_exitz_sse4_2) cmp $1, %rdx - je LABEL(Byte0) + je LABEL(Byte0_sse4_2) mov %rdx, %r11 #endif mov %esi, %ecx @@ -117,23 +117,21 @@ STRCMP_SSE42: and $0x3f, %rcx /* rsi alignment in cache line */ and $0x3f, %rax /* rdi alignment in cache line */ cmp $0x30, %ecx - ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ + ja LABEL(crosscache_sse4_2)/* rsi: 16-byte load will cross cache line */ cmp $0x30, %eax - ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ - movlpd (%rdi), %xmm1 - movlpd (%rsi), %xmm2 - movhpd 8(%rdi), %xmm1 - movhpd 8(%rsi), %xmm2 + ja LABEL(crosscache_sse4_2)/* rdi: 16-byte load will cross cache line */ + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ pmovmskb %xmm1, %edx sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ - jnz LABEL(less16bytes) /* If not, find different value or null char */ + jnz LABEL(less16bytes_sse4_2)/* If not, find different value or null char */ #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) /* finish comparision */ + jbe LABEL(strcmp_exitz_sse4_2)/* finish comparision */ #endif add $16, %rsi /* prepare to search next 16 bytes */ add $16, %rdi /* prepare to search next 16 bytes */ @@ -144,7 +142,7 @@ STRCMP_SSE42: * below to use. 
*/ .p2align 4 -LABEL(crosscache): +LABEL(crosscache_sse4_2): and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ mov $0xffff, %edx /* for equivalent offset */ @@ -152,15 +150,15 @@ LABEL(crosscache): and $0xf, %ecx /* offset of rsi */ and $0xf, %eax /* offset of rdi */ cmp %eax, %ecx - je LABEL(ashr_0) /* rsi and rdi relative offset same */ - ja LABEL(bigger) + je LABEL(ashr_0_sse4_2) /* rsi and rdi relative offset same */ + ja LABEL(bigger_sse4_2) mov %edx, %r8d /* r8d is offset flag for exit tail */ xchg %ecx, %eax xchg %rsi, %rdi -LABEL(bigger): +LABEL(bigger_sse4_2): lea 15(%rax), %r9 sub %rcx, %r9 - lea LABEL(unaligned_table)(%rip), %r10 + lea LABEL(unaligned_table_sse4_2)(%rip), %r10 movslq (%r10, %r9,4), %r9 lea (%r10, %r9), %r10 jmp *%r10 /* jump to corresponding case */ @@ -171,7 +169,7 @@ LABEL(bigger): * n(0~15) n(0~15) 15(15+ n-n) ashr_0 */ .p2align 4 -LABEL(ashr_0): +LABEL(ashr_0_sse4_2): movdqa (%rsi), %xmm1 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ @@ -186,7 +184,7 @@ LABEL(ashr_0): * edx must be the same with r9d if in left byte (16-rcx) is equal to * the start from (16-rax) and no null char was seen. */ - jne LABEL(less32bytes) /* mismatch or null char */ + jne LABEL(less32bytes_sse4_2) /* mismatch or null char */ UPDATE_STRNCMP_COUNTER mov $16, %rcx mov $16, %r9 @@ -205,7 +203,7 @@ LABEL(ashr_0_use_sse4_2): jbe LABEL(ashr_0_use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif movdqa (%rdi,%rdx), %xmm0 @@ -214,17 +212,17 @@ LABEL(ashr_0_use_sse4_2): jbe LABEL(ashr_0_use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif jmp LABEL(ashr_0_use_sse4_2) .p2align 4 LABEL(ashr_0_use_sse4_2_exit): - jnc LABEL(strcmp_exitz) + jnc LABEL(strcmp_exitz_sse4_2) #ifdef USE_AS_STRNCMP sub %rcx, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif lea -16(%rdx, %rcx), %rcx movzbl (%rdi, %rcx), %eax @@ -241,7 +239,7 @@ LABEL(ashr_0_use_sse4_2_exit): * n(15) n -15 0(15 +(n-15) - n) ashr_1 */ .p2align 4 -LABEL(ashr_1): +LABEL(ashr_1_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -253,7 +251,7 @@ LABEL(ashr_1): shr %cl, %edx /* adjust 0xffff for offset */ shr %cl, %r9d /* adjust for 16-byte offset */ sub %r9d, %edx - jnz LABEL(less32bytes) /* mismatch or null char seen */ + jnz LABEL(less32bytes_sse4_2)/* mismatch or null char seen */ movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -281,7 +279,7 @@ LABEL(loop_ashr_1_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -294,7 +292,7 @@ LABEL(loop_ashr_1_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_1_use_sse4_2) @@ -320,7 +318,7 @@ LABEL(nibble_ashr_1_use_sse4_2): * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 */ .p2align 4 -LABEL(ashr_2): +LABEL(ashr_2_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -332,7 +330,7 @@ LABEL(ashr_2): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -360,7 +358,7 @@ LABEL(loop_ashr_2_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe 
LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -373,7 +371,7 @@ LABEL(loop_ashr_2_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_2_use_sse4_2) @@ -399,7 +397,7 @@ LABEL(nibble_ashr_2_use_sse4_2): * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 */ .p2align 4 -LABEL(ashr_3): +LABEL(ashr_3_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -411,7 +409,7 @@ LABEL(ashr_3): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -439,7 +437,7 @@ LABEL(loop_ashr_3_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -452,7 +450,7 @@ LABEL(loop_ashr_3_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_3_use_sse4_2) @@ -478,7 +476,7 @@ LABEL(nibble_ashr_3_use_sse4_2): * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 */ .p2align 4 -LABEL(ashr_4): +LABEL(ashr_4_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -490,7 +488,7 @@ LABEL(ashr_4): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -519,7 +517,7 @@ LABEL(loop_ashr_4_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -532,7 +530,7 @@ LABEL(loop_ashr_4_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_4_use_sse4_2) @@ -558,7 +556,7 @@ LABEL(nibble_ashr_4_use_sse4_2): * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 */ .p2align 4 -LABEL(ashr_5): +LABEL(ashr_5_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -570,7 +568,7 @@ LABEL(ashr_5): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -599,7 +597,7 @@ LABEL(loop_ashr_5_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -613,7 +611,7 @@ LABEL(loop_ashr_5_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_5_use_sse4_2) @@ -639,7 +637,7 @@ LABEL(nibble_ashr_5_use_sse4_2): * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 */ .p2align 4 -LABEL(ashr_6): +LABEL(ashr_6_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -651,7 +649,7 @@ LABEL(ashr_6): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -680,7 +678,7 @@ LABEL(loop_ashr_6_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -693,7 +691,7 @@ LABEL(loop_ashr_6_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_6_use_sse4_2) @@ -719,7 +717,7 @@ 
LABEL(nibble_ashr_6_use_sse4_2): * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 */ .p2align 4 -LABEL(ashr_7): +LABEL(ashr_7_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -731,7 +729,7 @@ LABEL(ashr_7): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -760,7 +758,7 @@ LABEL(loop_ashr_7_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -773,7 +771,7 @@ LABEL(loop_ashr_7_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_7_use_sse4_2) @@ -799,7 +797,7 @@ LABEL(nibble_ashr_7_use_sse4_2): * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 */ .p2align 4 -LABEL(ashr_8): +LABEL(ashr_8_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -811,7 +809,7 @@ LABEL(ashr_8): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -840,7 +838,7 @@ LABEL(loop_ashr_8_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -853,7 +851,7 @@ LABEL(loop_ashr_8_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_8_use_sse4_2) @@ -879,7 +877,7 @@ LABEL(nibble_ashr_8_use_sse4_2): * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 */ .p2align 4 -LABEL(ashr_9): +LABEL(ashr_9_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -891,7 +889,7 @@ LABEL(ashr_9): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -921,7 +919,7 @@ LABEL(loop_ashr_9_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -934,7 +932,7 @@ LABEL(loop_ashr_9_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_9_use_sse4_2) @@ -960,7 +958,7 @@ LABEL(nibble_ashr_9_use_sse4_2): * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 */ .p2align 4 -LABEL(ashr_10): +LABEL(ashr_10_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -972,7 +970,7 @@ LABEL(ashr_10): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1001,7 +999,7 @@ LABEL(loop_ashr_10_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1014,7 +1012,7 @@ LABEL(loop_ashr_10_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_10_use_sse4_2) @@ -1040,7 +1038,7 @@ LABEL(nibble_ashr_10_use_sse4_2): * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 */ .p2align 4 -LABEL(ashr_11): +LABEL(ashr_11_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1052,7 +1050,7 @@ LABEL(ashr_11): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz 
LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1081,7 +1079,7 @@ LABEL(loop_ashr_11_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1094,7 +1092,7 @@ LABEL(loop_ashr_11_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_11_use_sse4_2) @@ -1120,7 +1118,7 @@ LABEL(nibble_ashr_11_use_sse4_2): * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 */ .p2align 4 -LABEL(ashr_12): +LABEL(ashr_12_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1132,7 +1130,7 @@ LABEL(ashr_12): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1161,7 +1159,7 @@ LABEL(loop_ashr_12_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1174,7 +1172,7 @@ LABEL(loop_ashr_12_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_12_use_sse4_2) @@ -1200,7 +1198,7 @@ LABEL(nibble_ashr_12_use_sse4_2): * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 */ .p2align 4 -LABEL(ashr_13): +LABEL(ashr_13_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1212,7 +1210,7 @@ LABEL(ashr_13): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1242,7 +1240,7 @@ LABEL(loop_ashr_13_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1255,7 +1253,7 @@ LABEL(loop_ashr_13_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_13_use_sse4_2) @@ -1281,7 +1279,7 @@ LABEL(nibble_ashr_13_use_sse4_2): * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 */ .p2align 4 -LABEL(ashr_14): +LABEL(ashr_14_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1293,7 +1291,7 @@ LABEL(ashr_14): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 UPDATE_STRNCMP_COUNTER @@ -1323,7 +1321,7 @@ LABEL(loop_ashr_14_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ -1336,7 +1334,7 @@ LABEL(loop_ashr_14_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_14_use_sse4_2) @@ -1362,7 +1360,7 @@ LABEL(nibble_ashr_14_use_sse4_2): * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 */ .p2align 4 -LABEL(ashr_15): +LABEL(ashr_15_sse4_2): pxor %xmm0, %xmm0 movdqa (%rdi), %xmm2 movdqa (%rsi), %xmm1 @@ -1374,7 +1372,7 @@ LABEL(ashr_15): shr %cl, %edx shr %cl, %r9d sub %r9d, %edx - jnz LABEL(less32bytes) + jnz LABEL(less32bytes_sse4_2) movdqa (%rdi), %xmm3 @@ -1406,7 +1404,7 @@ LABEL(loop_ashr_15_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx @@ 
-1419,7 +1417,7 @@ LABEL(loop_ashr_15_use_sse4_2): jbe LABEL(use_sse4_2_exit) #ifdef USE_AS_STRNCMP sub $16, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add $16, %rdx jmp LABEL(loop_ashr_15_use_sse4_2) @@ -1441,219 +1439,78 @@ LABEL(nibble_ashr_use_sse4_2_exit): pcmpistri $0x1a,(%rsi,%rdx), %xmm0 .p2align 4 LABEL(use_sse4_2_exit): - jnc LABEL(strcmp_exitz) + jnc LABEL(strcmp_exitz_sse4_2) #ifdef USE_AS_STRNCMP sub %rcx, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif add %rcx, %rdx lea -16(%rdi, %r9), %rdi movzbl (%rdi, %rdx), %eax movzbl (%rsi, %rdx), %edx test %r8d, %r8d - jz LABEL(use_sse4_2_ret) + jz LABEL(use_sse4_2_ret_sse4_2) xchg %eax, %edx -LABEL(use_sse4_2_ret): +LABEL(use_sse4_2_ret_sse4_2): sub %edx, %eax ret - .p2align 4 -LABEL(aftertail): - pcmpeqb %xmm3, %xmm1 - psubb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - not %edx - - .p2align 4 -LABEL(exit): - lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ -LABEL(less32bytes): +LABEL(less32bytes_sse4_2): lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ test %r8d, %r8d - jz LABEL(ret) + jz LABEL(ret_sse4_2) xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ .p2align 4 -LABEL(ret): -LABEL(less16bytes): - /* - * Check to see if BSF is fast on this processor. If not, use a different - * exit tail. - */ +LABEL(ret_sse4_2): +LABEL(less16bytes_sse4_2): bsf %rdx, %rdx /* find and store bit index in %rdx */ #ifdef USE_AS_STRNCMP sub %rdx, %r11 - jbe LABEL(strcmp_exitz) + jbe LABEL(strcmp_exitz_sse4_2) #endif - xor %ecx, %ecx /* clear %ecx */ - xor %eax, %eax /* clear %eax */ - - movb (%rsi, %rdx), %cl - movb (%rdi, %rdx), %al + movzbl (%rsi, %rdx), %ecx + movzbl (%rdi, %rdx), %eax sub %ecx, %eax ret -LABEL(strcmp_exitz): +LABEL(strcmp_exitz_sse4_2): xor %eax, %eax ret .p2align 4 -LABEL(Byte0): - /* - * never need to handle byte 0 for strncmpy -#ifdef USE_AS_STRNCMP - sub $0, %r11 - jbe LABEL(strcmp_exitz) -#endif - */ +LABEL(Byte0_sse4_2): movzx (%rsi), %ecx movzx (%rdi), %eax sub %ecx, %eax ret - - .p2align 4 -LABEL(Byte1): - -#ifdef USE_AS_STRNCMP - sub $1, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 1(%rsi), %ecx - movzx 1(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte2): - -#ifdef USE_AS_STRNCMP - sub $2, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 2(%rsi), %ecx - movzx 2(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte3): - -#ifdef USE_AS_STRNCMP - sub $3, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 3(%rsi), %ecx - movzx 3(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte4): - -#ifdef USE_AS_STRNCMP - sub $4, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 4(%rsi), %ecx - movzx 4(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte5): - -#ifdef USE_AS_STRNCMP - sub $5, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 5(%rsi), %ecx - movzx 5(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(Byte6): - -#ifdef USE_AS_STRNCMP - sub $6, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 6(%rsi), %ecx - movzx 6(%rdi), %eax - - sub %ecx, %eax - ret - - .p2align 4 -LABEL(next_8_bytes): - add $8, %rdi - add $8, %rsi -#ifdef USE_AS_STRNCMP - sub $8, %r11 - jbe LABEL(strcmp_exitz) -#endif - test $0x01, %dh - jnz LABEL(Byte0) - - test $0x02, %dh - jnz LABEL(Byte1) - - test $0x04, %dh - jnz LABEL(Byte2) - - test $0x08, %dh - jnz LABEL(Byte3) - - test $0x10, %dh - jnz LABEL(Byte4) - - test $0x20, %dh - 
jnz LABEL(Byte5) - - test $0x40, %dh - jnz LABEL(Byte6) - -#ifdef USE_AS_STRNCMP - sub $7, %r11 - jbe LABEL(strcmp_exitz) -#endif - movzx 7(%rsi), %ecx - movzx 7(%rdi), %eax - - sub %ecx, %eax - ret cfi_endproc .size STRCMP_SSE42, .-STRCMP_SSE42 /* Put all SSE 4.2 functions together. */ .section .rodata.sse4.2,"a",@progbits - .p2align 4 -LABEL(unaligned_table): - .int LABEL(ashr_1) - LABEL(unaligned_table) - .int LABEL(ashr_2) - LABEL(unaligned_table) - .int LABEL(ashr_3) - LABEL(unaligned_table) - .int LABEL(ashr_4) - LABEL(unaligned_table) - .int LABEL(ashr_5) - LABEL(unaligned_table) - .int LABEL(ashr_6) - LABEL(unaligned_table) - .int LABEL(ashr_7) - LABEL(unaligned_table) - .int LABEL(ashr_8) - LABEL(unaligned_table) - .int LABEL(ashr_9) - LABEL(unaligned_table) - .int LABEL(ashr_10) - LABEL(unaligned_table) - .int LABEL(ashr_11) - LABEL(unaligned_table) - .int LABEL(ashr_12) - LABEL(unaligned_table) - .int LABEL(ashr_13) - LABEL(unaligned_table) - .int LABEL(ashr_14) - LABEL(unaligned_table) - .int LABEL(ashr_15) - LABEL(unaligned_table) - .int LABEL(ashr_0) - LABEL(unaligned_table) + .p2align 3 +LABEL(unaligned_table_sse4_2): + .int LABEL(ashr_1_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_2_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_3_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_4_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_5_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_6_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_7_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_8_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_9_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_10_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_11_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_12_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_13_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_14_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_15_sse4_2) - LABEL(unaligned_table_sse4_2) + .int LABEL(ashr_0_sse4_2) - LABEL(unaligned_table_sse4_2) # undef ENTRY @@ -1673,6 +1530,4 @@ LABEL(unaligned_table): .globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2 #endif -#ifndef USE_AS_STRNCMP #include "../strcmp.S" -#endif diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S index 25cd01307d..7e400a9140 100644 --- a/sysdeps/x86_64/multiarch/strcpy.S +++ b/sysdeps/x86_64/multiarch/strcpy.S @@ -64,16 +64,9 @@ ENTRY(STRCPY) call __init_cpu_features 1: leaq STRCPY_SSE2(%rip), %rax testl $(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) - jz 3f -/* Avoid SSSE3 strcpy on Atom since it is slow. 
*/ - cmpl $1, __cpu_features+KIND_OFFSET(%rip) - jne 2f - cmpl $6, __cpu_features+FAMILY_OFFSET(%rip) - jne 2f - cmpl $28, __cpu_features+MODEL_OFFSET(%rip) - jz 3f -2: leaq STRCPY_SSSE3(%rip), %rax -3: ret + jz 2f + leaq STRCPY_SSSE3(%rip), %rax +2: ret END(STRCPY) .section .text.ssse3,"ax",@progbits diff --git a/sysdeps/x86_64/multiarch/strncmp-c.c b/sysdeps/x86_64/multiarch/strncmp-c.c deleted file mode 100644 index d4f74a418d..0000000000 --- a/sysdeps/x86_64/multiarch/strncmp-c.c +++ /dev/null @@ -1,8 +0,0 @@ -#ifdef SHARED -#define STRNCMP __strncmp_sse2 -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(name) \ - __hidden_ver1 (__strncmp_sse2, __GI_strncmp, __strncmp_sse2); -#endif - -#include "strncmp.c" diff --git a/sysdeps/x86_64/rtld-memchr.c b/sysdeps/x86_64/rtld-memchr.c new file mode 100644 index 0000000000..f63fefbcec --- /dev/null +++ b/sysdeps/x86_64/rtld-memchr.c @@ -0,0 +1 @@ +#include <string/memchr.c> diff --git a/sysdeps/x86_64/rtld-memcmp.c b/sysdeps/x86_64/rtld-memcmp.c new file mode 100644 index 0000000000..2ee40328b8 --- /dev/null +++ b/sysdeps/x86_64/rtld-memcmp.c @@ -0,0 +1 @@ +#include <string/memcmp.c> diff --git a/sysdeps/x86_64/rtld-rawmemchr.c b/sysdeps/x86_64/rtld-rawmemchr.c new file mode 100644 index 0000000000..2b9189393c --- /dev/null +++ b/sysdeps/x86_64/rtld-rawmemchr.c @@ -0,0 +1 @@ +#include <string/rawmemchr.c> diff --git a/sysdeps/x86_64/rtld-strchr.S b/sysdeps/x86_64/rtld-strchr.S new file mode 100644 index 0000000000..8934697972 --- /dev/null +++ b/sysdeps/x86_64/rtld-strchr.S @@ -0,0 +1,291 @@ +/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR. + For AMD x86-64. + Copyright (C) 2002, 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + + + .text +ENTRY (BP_SYM (strchr)) + + /* Before we start with the main loop we process single bytes + until the source pointer is aligned. This has two reasons: + 1. aligned 64-bit memory access is faster + and (more important) + 2. we process in the main loop 64 bit in one step although + we don't know the end of the string. But accessing at + 8-byte alignment guarantees that we never access illegal + memory if this would not also be done by the trivial + implementation (this is because all processor inherent + boundaries are multiples of 8). */ + + movq %rdi, %rdx + andl $7, %edx /* Mask alignment bits */ + movq %rdi, %rax /* duplicate destination. */ + jz 1f /* aligned => start loop */ + neg %edx + addl $8, %edx /* Align to 8 bytes. */ + + /* Search the first bytes directly. */ +0: movb (%rax), %cl /* load byte */ + cmpb %cl,%sil /* compare byte. */ + je 6f /* target found */ + testb %cl,%cl /* is byte NUL? 
*/ + je 7f /* yes => return NULL */ + incq %rax /* increment pointer */ + decl %edx + jnz 0b + + +1: + /* At the moment %rsi contains C. What we need for the + algorithm is C in all bytes of the register. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). */ + /* Populate 8 bit data to full 64-bit. */ + movabs $0x0101010101010101,%r9 + movzbl %sil,%edx + imul %rdx,%r9 + + movq $0xfefefefefefefeff, %r8 /* Save magic. */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of QUARDWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24 tec.. If one of bits 54-63 is set, there will be a carry + into bit 64 (=carry flag), so all of the hole bits will + be changed. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. */ + + .p2align 4 +4: + /* Main Loop is unrolled 4 times. */ + /* First unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + + /* Second unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. 
*/ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + /* Third unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 7f /* found NUL => return NULL */ + /* Fourth unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + xorq %r9, %rcx /* XOR with qword c|...|c => bytes of str == c + are now 0 */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found c => return pointer */ + + /* The quadword we looked at does not contain the value we're looking + for. Let's search now whether we have reached the end of the + string. */ + xorq %r9, %rcx /* restore original dword without reload */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 7f /* highest byte is NUL => return NULL */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => restart loop */ + + +7: /* Return NULL. */ + xorl %eax, %eax + retq + + + /* We now scan for the byte in which the character was matched. + But we have to take care of the case that a NUL char is + found before this in the dword. 
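Both this rtld-strchr.S and the rtld-strlen.S added below rely on the carry-propagation trick the comment above describes: adding the magic constant 0xfefefefefefefeff to a 64-bit word carries out of every byte position unless some byte is zero, so a broken carry chain pinpoints a NUL (or, after the XOR, a match for C). A small C model of that test (an illustrative sketch, not code from the patch):

#include <stdint.h>

/* Return nonzero iff WORD contains at least one zero byte, using the
   same magic-constant test as the unrolled loops above.  */
static inline int
has_zero_byte (uint64_t word)
{
  const uint64_t magic = 0xfefefefefefefeffULL;   /* %r8 in the asm */
  uint64_t sum = word + magic;

  /* The asm checks the carry flag ("jnc"): if the addition does not
     carry out of bit 63, a zero byte stopped the carry chain.  In C
     the lost carry is recovered by testing for wrap-around.  */
  if (sum >= word)
    return 1;

  /* Otherwise mimic the xor/or/inc sequence: every "hole" bit of
     MAGIC (bit 0 of bytes 1..7) must have been flipped by an incoming
     carry; if one was not, a zero byte broke the chain below it.  */
  return (((sum ^ word) | magic) + 1) != 0;
}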
Note that we XORed %rcx + with the byte we're looking for, therefore the tests below look + reversed. */ + + + .p2align 4 /* Align, it's a jump target. */ +3: movq %r9,%rdx /* move to %rdx so that we can access bytes */ + subq $8,%rax /* correct pointer increment. */ + testb %cl, %cl /* is first byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is first byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is second byte NUL? */ + je 7b /* yes => return NULL? */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is third byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is third byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is fourth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is fourth byte NUL? */ + je 7b /* yes => return NULL? */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is fifth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is fifth byte NUL? */ + je 7b /* yes => return NULL */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is sixth byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %ch /* is sixth byte NUL? */ + je 7b /* yes => return NULL? */ + incq %rax /* increment pointer */ + + shrq $16, %rcx /* make upper bytes accessible */ + testb %cl, %cl /* is seventh byte C? */ + jz 6f /* yes => return pointer */ + cmpb %dl, %cl /* is seventh byte NUL? */ + je 7b /* yes => return NULL */ + + /* It must be in the eigth byte and it cannot be NUL. */ + incq %rax + +6: + nop + retq +END (BP_SYM (strchr)) + +weak_alias (BP_SYM (strchr), BP_SYM (index)) +libc_hidden_builtin_def (strchr) diff --git a/sysdeps/x86_64/rtld-strcmp.S b/sysdeps/x86_64/rtld-strcmp.S new file mode 100644 index 0000000000..a25535c161 --- /dev/null +++ b/sysdeps/x86_64/rtld-strcmp.S @@ -0,0 +1,28 @@ +#include <sysdep.h> +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + + .text +ENTRY (BP_SYM (STRCMP)) +/* Simple version since we can't use SSE registers in ld.so. */ +L(oop): movb (%rdi), %al + cmpb (%rsi), %al + jne L(neq) + incq %rdi + incq %rsi + testb %al, %al + jnz L(oop) + + xorl %eax, %eax + ret + +L(neq): movl $1, %eax + movl $-1, %ecx + cmovbl %ecx, %eax + ret +END (BP_SYM (STRCMP)) diff --git a/sysdeps/x86_64/rtld-strlen.S b/sysdeps/x86_64/rtld-strlen.S new file mode 100644 index 0000000000..fd950edaaa --- /dev/null +++ b/sysdeps/x86_64/rtld-strlen.S @@ -0,0 +1,139 @@ +/* strlen(str) -- determine the length of the string STR. + Copyright (C) 2002, 2003 Free Software Foundation, Inc. + Based on i486 version contributed by Ulrich Drepper <drepper@redhat.com>. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include "asm-syntax.h" +#include "bp-sym.h" +#include "bp-asm.h" + + + .text +ENTRY (strlen) + movq %rdi, %rcx /* Duplicate source pointer. */ + andl $7, %ecx /* mask alignment bits */ + movq %rdi, %rax /* duplicate destination. */ + jz 1f /* aligned => start loop */ + + neg %ecx /* We need to align to 8 bytes. */ + addl $8,%ecx + /* Search the first bytes directly. */ +0: cmpb $0x0,(%rax) /* is byte NUL? */ + je 2f /* yes => return */ + incq %rax /* increment pointer */ + decl %ecx + jnz 0b + +1: movq $0xfefefefefefefeff,%r8 /* Save magic. */ + + .p2align 4 /* Align loop. */ +4: /* Main Loop is unrolled 4 times. */ + /* First unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Second unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Third unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz 3f /* found NUL => return pointer */ + + /* Fourth unroll. */ + movq (%rax), %rcx /* get double word (= 8 bytes) in question */ + addq $8,%rax /* adjust pointer for next word */ + movq %r8, %rdx /* magic value */ + addq %rcx, %rdx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rcx, %rdx /* (word+magic)^word */ + orq %r8, %rdx /* set all non-carry bits */ + incq %rdx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz 4b /* no NUL found => continue loop */ + + .p2align 4 /* Align, it's a jump target. */ +3: subq $8,%rax /* correct pointer increment. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0x00ff0000, %ecx /* is third byte NUL? 
*/ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + testl $0xff000000, %ecx /* is fourth byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ + + shrq $32, %rcx /* look at other half. */ + + testb %cl, %cl /* is first byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz 2f /* yes => return */ + incq %rax /* increment pointer */ + + testl $0xff0000, %ecx /* is third byte NUL? */ + jz 2f /* yes => return pointer */ + incq %rax /* increment pointer */ +2: + subq %rdi, %rax /* compute difference to string start */ + ret +END (strlen) +libc_hidden_builtin_def (strlen) diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index 119b88e40b..340a64ba35 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -1,8 +1,10 @@ /* Highly optimized version for x86-64. - Copyright (C) 1999, 2000, 2002, 2003, 2005 Free Software Foundation, Inc. + Copyright (C) 1999, 2000, 2002, 2003, 2005, 2009 + Free Software Foundation, Inc. This file is part of the GNU C Library. Based on i686 version contributed by Ulrich Drepper <drepper@cygnus.com>, 1999. + Updated with SSE2 support contributed by Intel Corporation. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public @@ -24,8 +26,35 @@ #include "bp-sym.h" #include "bp-asm.h" - .text -ENTRY (BP_SYM (strcmp)) +#undef UPDATE_STRNCMP_COUNTER + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + +#ifdef USE_AS_STRNCMP +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. */ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +#else +# define UPDATE_STRNCMP_COUNTER +# ifndef STRCMP +# define STRCMP strcmp +# endif +#endif + + .text +ENTRY (BP_SYM (STRCMP)) +#ifdef NOT_IN_libc +/* Simple version since we can't use SSE registers in ld.so. */ L(oop): movb (%rdi), %al cmpb (%rsi), %al jne L(neq) @@ -41,5 +70,1914 @@ L(neq): movl $1, %eax movl $-1, %ecx cmovbl %ecx, %eax ret -END (BP_SYM (strcmp)) -libc_hidden_builtin_def (strcmp) +END (BP_SYM (STRCMP)) +#else /* NOT_IN_libc */ +/* + * This implementation uses SSE to compare up to 16 bytes at a time. + */ +#ifdef USE_AS_STRNCMP + test %rdx, %rdx + je LABEL(strcmp_exitz) + cmp $1, %rdx + je LABEL(Byte0) + mov %rdx, %r11 +#endif + mov %esi, %ecx + mov %edi, %eax +/* Use 64bit AND here to avoid long NOP padding. */ + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ + movlpd (%rdi), %xmm1 + movlpd (%rsi), %xmm2 + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes) /* If not, find different value or null char */ +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) /* finish comparision */ +#endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine source and destination string offsets from 16-byte alignment. + * Use relative offset difference between the two to determine which case + * below to use. + */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + lea 15(%rax), %r9 + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + lea (%r10, %r9), %r10 + jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +LABEL(ashr_0): + + movdqa (%rsi), %xmm1 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx must be the same with r9d if in left byte (16-rcx) is equal to + * the start from (16-rax) and no null char was seen. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32-bytes per iteration. + */ + .p2align 4 +LABEL(loop_ashr_0): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) /* mismatch or null char seen */ + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + jmp LABEL(loop_ashr_0) + +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +LABEL(ashr_1): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ + pslldq $15, %xmm2 /* shift first string to align with second */ + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads*/ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_1): + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + +LABEL(gobble_ashr_1): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_1) + + /* + * Nibble avoids loads across page boundary. This is to avoid a potential + * access into unmapped memory. + */ + .p2align 4 +LABEL(nibble_ashr_1): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/ + pmovmskb %xmm0, %edx + test $0xfffe, %edx + jnz LABEL(ashr_1_exittail) /* find null char*/ + +#ifdef USE_AS_STRNCMP + cmp $14, %r11 + jbe LABEL(ashr_1_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* substract 4K from %r10 */ + jmp LABEL(gobble_ashr_1) + + /* + * Once find null char, determine if there is a string mismatch + * before the null char. + */ + .p2align 4 +LABEL(ashr_1_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $1, %xmm0 + psrldq $1, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +LABEL(ashr_2): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_2): + add $16, %r10 + jg LABEL(nibble_ashr_2) + +LABEL(gobble_ashr_2): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_2) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_2) + + .p2align 4 +LABEL(nibble_ashr_2): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfffc, %edx + jnz LABEL(ashr_2_exittail) + +#ifdef USE_AS_STRNCMP + cmp $13, %r11 + jbe LABEL(ashr_2_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_2) + + .p2align 4 +LABEL(ashr_2_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $2, %xmm0 + psrldq $2, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +LABEL(ashr_3): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_3): + add $16, %r10 + jg LABEL(nibble_ashr_3) + +LABEL(gobble_ashr_3): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_3) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $3, %xmm3 + pslldq $13, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_3) + + .p2align 4 +LABEL(nibble_ashr_3): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfff8, %edx + jnz LABEL(ashr_3_exittail) + +#ifdef USE_AS_STRNCMP + cmp $12, %r11 + jbe LABEL(ashr_3_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_3) + + .p2align 4 +LABEL(ashr_3_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $3, %xmm0 + psrldq $3, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +LABEL(ashr_4): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_4): + add $16, %r10 + jg LABEL(nibble_ashr_4) + +LABEL(gobble_ashr_4): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_4) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $4, %xmm3 + pslldq $12, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_4) + + .p2align 4 +LABEL(nibble_ashr_4): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfff0, %edx + jnz LABEL(ashr_4_exittail) + +#ifdef USE_AS_STRNCMP + cmp $11, %r11 + jbe LABEL(ashr_4_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_4) + + .p2align 4 +LABEL(ashr_4_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +LABEL(ashr_5): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_5): + add $16, %r10 + jg LABEL(nibble_ashr_5) + +LABEL(gobble_ashr_5): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_5) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $5, %xmm3 + pslldq $11, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_5) + + .p2align 4 +LABEL(nibble_ashr_5): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xffe0, %edx + jnz LABEL(ashr_5_exittail) + +#ifdef USE_AS_STRNCMP + cmp $10, %r11 + jbe LABEL(ashr_5_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_5) + + .p2align 4 +LABEL(ashr_5_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $5, %xmm0 + psrldq $5, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + */ + .p2align 4 +LABEL(ashr_6): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_6): + add $16, %r10 + jg LABEL(nibble_ashr_6) + +LABEL(gobble_ashr_6): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_6) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $6, %xmm3 + pslldq $10, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_6) + + .p2align 4 +LABEL(nibble_ashr_6): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xffc0, %edx + jnz LABEL(ashr_6_exittail) + +#ifdef USE_AS_STRNCMP + cmp $9, %r11 + jbe LABEL(ashr_6_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_6) + + .p2align 4 +LABEL(ashr_6_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $6, %xmm0 + psrldq $6, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + */ + .p2align 4 +LABEL(ashr_7): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 7(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_7): + add $16, %r10 + jg LABEL(nibble_ashr_7) + +LABEL(gobble_ashr_7): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $7, %xmm3 + pslldq $9, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_7) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $7, %xmm3 + pslldq $9, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_7) + + .p2align 4 +LABEL(nibble_ashr_7): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xff80, %edx + jnz LABEL(ashr_7_exittail) + +#ifdef USE_AS_STRNCMP + cmp $8, %r11 + jbe LABEL(ashr_7_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_7) + + .p2align 4 +LABEL(ashr_7_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $7, %xmm0 + psrldq $7, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 + */ + .p2align 4 +LABEL(ashr_8): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $8, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 8(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_8): + add $16, %r10 + jg LABEL(nibble_ashr_8) + +LABEL(gobble_ashr_8): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $8, %xmm3 + pslldq $8, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_8) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $8, %xmm3 + pslldq $8, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_8) + + .p2align 4 +LABEL(nibble_ashr_8): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xff00, %edx + jnz LABEL(ashr_8_exittail) + +#ifdef USE_AS_STRNCMP + cmp $7, %r11 + jbe LABEL(ashr_8_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_8) + + .p2align 4 +LABEL(ashr_8_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $8, %xmm0 + psrldq $8, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 + */ + .p2align 4 +LABEL(ashr_9): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $9, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 9(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_9): + add $16, %r10 + jg LABEL(nibble_ashr_9) + +LABEL(gobble_ashr_9): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $9, %xmm3 + pslldq $7, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_9) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $9, %xmm3 + pslldq $7, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 /* store for next cycle */ + jmp LABEL(loop_ashr_9) + + .p2align 4 +LABEL(nibble_ashr_9): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfe00, %edx + jnz LABEL(ashr_9_exittail) + +#ifdef USE_AS_STRNCMP + cmp $6, %r11 + jbe LABEL(ashr_9_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_9) + + .p2align 4 +LABEL(ashr_9_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $9, %xmm0 + psrldq $9, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 + */ + .p2align 4 +LABEL(ashr_10): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $10, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 10(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_10): + add $16, %r10 + jg LABEL(nibble_ashr_10) + +LABEL(gobble_ashr_10): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $10, %xmm3 + pslldq $6, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_10) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $10, %xmm3 + pslldq $6, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_10) + + .p2align 4 +LABEL(nibble_ashr_10): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfc00, %edx + jnz LABEL(ashr_10_exittail) + +#ifdef USE_AS_STRNCMP + cmp $5, %r11 + jbe LABEL(ashr_10_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_10) + + .p2align 4 +LABEL(ashr_10_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $10, %xmm0 + psrldq $10, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 + */ + .p2align 4 +LABEL(ashr_11): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $11, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 11(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_11): + add $16, %r10 + jg LABEL(nibble_ashr_11) + +LABEL(gobble_ashr_11): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $11, %xmm3 + pslldq $5, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_11) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $11, %xmm3 + pslldq $5, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_11) + + .p2align 4 +LABEL(nibble_ashr_11): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xf800, %edx + jnz LABEL(ashr_11_exittail) + +#ifdef USE_AS_STRNCMP + cmp $4, %r11 + jbe LABEL(ashr_11_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_11) + + .p2align 4 +LABEL(ashr_11_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $11, %xmm0 + psrldq $11, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 + */ + .p2align 4 +LABEL(ashr_12): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $12, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 12(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_12): + add $16, %r10 + jg LABEL(nibble_ashr_12) + +LABEL(gobble_ashr_12): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $12, %xmm3 + pslldq $4, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_12) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $12, %xmm3 + pslldq $4, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_12) + + .p2align 4 +LABEL(nibble_ashr_12): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xf000, %edx + jnz LABEL(ashr_12_exittail) + +#ifdef USE_AS_STRNCMP + cmp $3, %r11 + jbe LABEL(ashr_12_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_12) + + .p2align 4 +LABEL(ashr_12_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $12, %xmm0 + psrldq $12, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 + */ + .p2align 4 +LABEL(ashr_13): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $13, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 13(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_13): + add $16, %r10 + jg LABEL(nibble_ashr_13) + +LABEL(gobble_ashr_13): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $13, %xmm3 + pslldq $3, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_13) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $13, %xmm3 + pslldq $3, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_13) + + .p2align 4 +LABEL(nibble_ashr_13): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xe000, %edx + jnz LABEL(ashr_13_exittail) + +#ifdef USE_AS_STRNCMP + cmp $2, %r11 + jbe LABEL(ashr_13_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_13) + + .p2align 4 +LABEL(ashr_13_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $13, %xmm0 + psrldq $13, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 + */ + .p2align 4 +LABEL(ashr_14): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $14, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 14(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_14): + add $16, %r10 + jg LABEL(nibble_ashr_14) + +LABEL(gobble_ashr_14): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $14, %xmm3 + pslldq $2, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_14) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $14, %xmm3 + pslldq $2, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_14) + + .p2align 4 +LABEL(nibble_ashr_14): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xc000, %edx + jnz LABEL(ashr_14_exittail) + +#ifdef USE_AS_STRNCMP + cmp $1, %r11 + jbe LABEL(ashr_14_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_14) + + .p2align 4 +LABEL(ashr_14_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $14, %xmm0 + psrldq $14, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 + */ + .p2align 4 +LABEL(ashr_15): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $1, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $15, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 15(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_15): + add $16, %r10 + jg LABEL(nibble_ashr_15) + +LABEL(gobble_ashr_15): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $15, %xmm3 + pslldq $1, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_15) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $15, %xmm3 + pslldq $1, %xmm2 + por %xmm3, %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe LABEL(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_15) + + .p2align 4 +LABEL(nibble_ashr_15): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0x8000, %edx + jnz LABEL(ashr_15_exittail) + +#ifdef USE_AS_STRNCMP + test %r11, %r11 + je LABEL(ashr_15_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_15) + + .p2align 4 +LABEL(ashr_15_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $15, %xmm3 + psrldq $15, %xmm0 + + .p2align 4 +LABEL(aftertail): + pcmpeqb %xmm3, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + not %edx + + .p2align 4 +LABEL(exit): + lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ +LABEL(less32bytes): + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ + test %r8d, %r8d + jz LABEL(ret) + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ + + .p2align 4 +LABEL(ret): +LABEL(less16bytes): + bsf %rdx, %rdx /* find and store bit index in %rdx */ + +#ifdef USE_AS_STRNCMP + sub %rdx, %r11 + jbe LABEL(strcmp_exitz) +#endif + movzbl (%rsi, %rdx), %ecx + movzbl (%rdi, %rdx), %eax + + sub %ecx, %eax + ret + +LABEL(strcmp_exitz): + xor %eax, %eax + ret + + .p2align 4 +LABEL(Byte0): + movzx (%rsi), %ecx + movzx (%rdi), %eax + + sub %ecx, %eax + ret +END (BP_SYM (STRCMP)) + + .section .rodata,"a",@progbits + .p2align 3 +LABEL(unaligned_table): + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_9) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + .int LABEL(ashr_0) - LABEL(unaligned_table) +#endif /* NOT_IN_libc */ +libc_hidden_builtin_def (STRCMP) diff --git a/sysdeps/x86_64/strncmp.S b/sysdeps/x86_64/strncmp.S new file mode 100644 index 0000000000..0af34e7f15 --- /dev/null +++ b/sysdeps/x86_64/strncmp.S @@ -0,0 +1,3 @@ +#define STRCMP strncmp +#define USE_AS_STRNCMP +#include 
"strcmp.S" diff --git a/sysdeps/x86_64/tst-xmmymm.sh b/sysdeps/x86_64/tst-xmmymm.sh new file mode 100755 index 0000000000..0735276e6d --- /dev/null +++ b/sysdeps/x86_64/tst-xmmymm.sh @@ -0,0 +1,17 @@ +#! /bin/sh +objpfx="$1" + +tmp=$(mktemp ${objpfx}tst-xmmymm.XXXXXX) +trap 'rm -f "$tmp"' 1 2 3 15 + +objdump -d "${objpfx}ld.so" | +awk 'BEGIN { last="" } /^[[:xdigit:]]* <[_[:alnum:]]*>:$/ { fct=substr($2, 2, length($2)-3) } /,%[xy]mm[[:digit:]]*$/ { if (last != fct) { print fct; last=fct} }' | +tee "$tmp" + +echo "Functions which incorrectly modify xmm/ymm registers:" +err=1 +egrep -vs '^_dl_runtime_profile$' "$tmp" || err=0 +if test $err -eq 0; then echo "None"; fi + +rm "$tmp" +exit $err |