author | Andreas Schwab <schwab@redhat.com> | 2009-07-20 11:02:11 +0200
---|---|---
committer | Andreas Schwab <schwab@redhat.com> | 2009-07-20 11:02:11 +0200
commit | 53924a77a2b827e7f9af6424a6a30224d09692d1 (patch) |
tree | ba5d034a512524339fcfed113518eb83201fdc23 |
parent | 8ecde8e8c2a8e77804f954afffd9efe0ab951e52 (diff) |
parent | 42e69bcf1137fccfd7a95645a9d316c6490b9ff9 (diff) |
Merge commit 'origin/master' into fedora/master
78 files changed, 6487 insertions, 1236 deletions
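The headline change in this merge is STB_GNU_UNIQUE support: `do_lookup_x` now maintains a per-namespace unique-symbol table so that a symbol of type `gnu_unique_object` resolves to a single definition no matter which loaded object is searched first. The table added in `elf/dl-lookup.c` is open-addressed and probed with double hashing over a prime-sized array. The sketch below is not the glibc code; it is a minimal standalone model of the same probing scheme, with hypothetical names, no locking, no table growth, and without the `hashval == 0 && name == NULL` empty-slot test the real lookup loop performs.

```c
#include <stddef.h>
#include <string.h>

/* Prime table size; 31 matches INITIAL_NUNIQUE_SYM_TABLE in the
   diff below.  */
#define SIZE 31

struct entry
{
  unsigned int hashval;		/* 0 marks an empty slot in this model.  */
  const char *name;
};

static struct entry table[SIZE];

static struct entry *
lookup_or_insert (const char *name, unsigned int hash)
{
  size_t idx = hash % SIZE;
  /* Second hash value: a nonzero probe stride.  Because SIZE is
     prime, stepping by HASH2 eventually visits every slot.  */
  size_t hash2 = 1 + hash % (SIZE - 2);

  while (1)
    {
      if (table[idx].hashval == hash
	  && strcmp (table[idx].name, name) == 0)
	return &table[idx];	/* An earlier definition wins.  */

      if (table[idx].hashval == 0)
	{
	  /* Empty slot: record this definition.  */
	  table[idx].hashval = hash;
	  table[idx].name = name;
	  return &table[idx];
	}

      idx += hash2;		/* Collision: advance by the stride.  */
      if (idx >= SIZE)
	idx -= SIZE;
    }
}
```

The real table additionally stores the symbol and its link map, grows through `_dl_higher_prime_number` (added to `elf/dl-misc.c` below), and is guarded by a recursive lock initialized per namespace in `elf/rtld.c` and `elf/dl-open.c`.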
diff --git a/ChangeLog b/ChangeLog index 69edfd237e..9ab7a42c63 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,204 @@ +2009-07-17 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/generic/sysdep.h: Define cfi_personality, cfi_lsda, + CFI_PERSONALITY, CFI_LSDA, and DW_EH_PE_* constants. + +2009-07-16 Ulrich Drepper <drepper@redhat.com> + + [BZ #10360] + * resolv/res-mkquery.c (__res_nopt): If anslen is > 0xffff store + 0xffff in the EDNS0 record. + +2009-07-16 Ulrich Drepper <drepper@redhat.com> + + * nscd/cache.c (cache_add): Use atomic_compare_and_exchange_bool_rel + instead of atomic_compare_and_exchange_bool_acq to ensure pointer + is written before the list head update. + Patch by Andreas Schwab <aschwab@redhat.com>. + +2009-07-16 Ulrich Drepper <drepper@redhat.com> + Jakub Jelinek <jakub@redhat.com> + + * malloc/malloc.c [ATOMIC_FASTBINS] (_int_free): Make check for + corruption thread-safe. + +2009-07-13 Jakub Jelinek <jakub@redhat.com> + + * include/atomic.h (catomic_compare_and_exchange_val_rel): If arch + overrides atomic_compare_and_exchange_val_rel, define to + atomic_compare_and_exchange_val_rel by default, otherwise default + to catomic_compare_and_exchange_val_acq. + (catomic_compare_and_exchange_bool_rel): If arch overrides + atomic_compare_and_exchange_bool_rel, define to + atomic_compare_and_exchange_bool_rel by default. + * malloc/malloc.c (_int_free): Revert 2009-07-02 change. + Use catomic_compare_and_exchange_val_rel instead of + catomic_compare_and_exchange_val_acq. + +2009-07-16 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/generic/ldsodefs.h: Add prototype for + _dl_higher_prime_number. + * elf/dl-misc.c (_dl_higher_prime_number): Mark with internal_function. + + * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Optimize + restoring of ymm registers a bit. + +2009-07-15 H.J. Lu <hongjiu.lu@intel.com> + + * sysdeps/x86_64/memcmp.S: New file. + +2009-07-15 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/x86-64/dl-trampoline.h: Remove after integrating code into... + * sysdeps/x86-64/dl-trampoline.S: ...here. Rewrite to avoid function + pointers in writable memory. + +2009-07-07 H.J. Lu <hongjiu.lu@intel.com> + + * config.h.in: Add HAVE_AVX_SUPPORT entry. + * config.make.in: Add config-cflags-avx entry. + * configure.in: Substitute libc_cv_cc_avx. + * elf/Makefile: Add rules to build and run tst-audit4 and tst-audit5. + * elf/tst-audit4.c: New file. + * elf/tst-audit5.c: New file. + * elf/tst-auditmod4a.c: New file. + * elf/tst-auditmod4b.c: New file. + * elf/tst-auditmod5a.c: New file. + * elf/tst-auditmod5b.c: New file. + * sysdeps/x86_64/Makefile (gen-as-const-headers): Add + link-defines.sym. + * sysdeps/x86_64/bits/link.h (La_x86_64_ymm): New. + (La_x86_64_vector): Likewise. + (La_x86_64_regs): Append lr_vector. + (La_x86_64_retval): Append lr_vector0/lrv_vector1. + * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Move + saving and restoring SSE registers to ... + * sysdeps/x86_64/dl-trampoline.h: This. New file. + * sysdeps/x86_64/dl-trampoline.S: Include <config.h> and + <link-defines.h>. + (_dl_runtime_profile): Use LR_SIZE to allocate space for + La_x86_64_regs. Allocate extra space and jump to memory at + save_and_restore_vector if HAVE_AVX_SUPPORT is defined. + (save_and_restore_vector_sse): New. + (save_and_restore_vector_avx): Likewise. + (check_avx): Likewise. + (save_and_restore_vector): Likewise. + * sysdeps/x86_64/elf/configure.in: Set libc_cv_cc_avx and + HAVE_AVX_SUPPORT. + * sysdeps/x86_64/link-defines.sym: New file. 
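Two of the ChangeLog entries above turn on release ordering: the nscd `cache_add` fix replaces an acquire compare-and-exchange with a release one so the new record is fully written before its pointer becomes reachable from the list head, and the `include/atomic.h` change gives `catomic_compare_and_exchange_{val,bool}_rel` proper defaults. A minimal C11 illustration of that publish pattern follows; it uses standard atomics, not glibc's internal macros.

```c
#include <stdatomic.h>

struct node
{
  struct node *next;
  int payload;
};

static _Atomic (struct node *) list_head;

/* Publish N on a lock-free list.  The release CAS guarantees the
   stores to N->payload and N->next are visible to any thread that
   later reads list_head with acquire semantics; this is the
   ordering the nscd fix needs before it updates the list head.  */
void
push (struct node *n, int payload)
{
  n->payload = payload;
  struct node *old = atomic_load_explicit (&list_head,
					   memory_order_relaxed);
  do
    n->next = old;
  while (!atomic_compare_exchange_weak_explicit
	 (&list_head, &old, n,
	  memory_order_release,		/* success: publish */
	  memory_order_relaxed));	/* failure: just retry */
}
```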
+ +2009-07-10 Ulrich Drepper <drepper@redhat.com> + + * elf/do-lookup.h: Removed after folding content into... + * elf/dl-lookup.c: ...here. + + * sysdeps/unix/sysv/linux/sys/epoll.h: Fix comment. + +2009-07-09 Ulrich Drepper <drepper@redhat.com> + + * configure.in: Check for gnu_unique_symbol symbol type. + * config.h.in: Add HAVE_ASM_UNIQUE_OBJECT entry. + * elf/do-lookup.h (do_lookup_x): Take new parameter with link map of + the undefined symbol. Handle STB_GNU_UNIQUE binding of found symbol. + * elf/dl-lookup.c (_dl_lookup_symbol_x): Adjust callers for do_lookup_x + change. + * sysdeps/generic/ldsodefs.h (struct rtld_global): Add definitions for + unique symbol table. + * elf/rtld.c (rtld_global): Initialize lock of unique symbol hash table + for first namespace. + * elf/dl-open.c (_dl_open): For new namespace, initialize lock for + unique symbol hash table. + * elf/Makefile: Add rules to build and run tst-unique1 and tst-unique2. + * elf/tst-unique1.c: New file. + * elf/tst-unique1mod1.c: New file. + * elf/tst-unique1mod2.c: New file. + * elf/tst-unique2.c: New file. + * elf/tst-unique2mod1.c: New file. + * elf/tst-unique2mod2.c: New file. + +2009-07-07 Ulrich Drepper <drepper@redhat.com> + + * elf/elf.h (STB_GNU_UNIQUE): Define. + + * elf/dl-misc.c (_dl_higher_prime_number): New function. Moved here + from... + * include/inline-hashtab.h: ...here. + (htab_expand): Adjust for renamed function. Correct memory handling. + +2009-07-06 Ulrich Drepper <drepper@redhat.com> + + * elf/do-lookup.h (do_lookup_x): Optimize test for valid symbol types. + +2009-07-03 Andreas Schwab <aschwab@redhat.com> + + * sysdeps/powerpc/sysdep.h (PPC_FEATURE_ARCH_2_06): Fix value. + (PPC_FEATURE_HAS_VSX): Likewise. + +2009-07-03 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/x86_64/multiarch/strcspn-c.c: Minor cleanups. + * sysdeps/x86_64/multiarch/strspn-c.c: Likewise. + + * sysdeps/x86_64/multiarch/strcmp.S: Make sure functions are all + aligned to 16 byte boundaries. + * sysdeps/x86_64/multiarch/strcpy.S: Likewise. + * sysdeps/x86_64/multiarch/strlen.S: Likewise. + * sysdeps/x86_64/multiarch/rawmemchr.S: Likewise. + +2009-07-02 H.J. Lu <hongjiu.lu@intel.com> + + * config.h.in (HAVE_SSE4_SUPPORT): New macro. + * config.make.in (config-cflags-sse4): New variable. + * configure.in: Substitute libc_cv_cc_sse4. + * sysdeps/i386/configure.in: Set libc_cv_cc_sse4 and + HAVE_SSE4_SUPPORT. + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + strcspn-c, strpbrk-c, strspn-c for string if gcc supports SSE4. + * sysdeps/x86_64/multiarch/strcspn-c.c: New file. + * sysdeps/x86_64/multiarch/strcspn.S: New file. + * sysdeps/x86_64/multiarch/strpbrk-c.c: New file. + * sysdeps/x86_64/multiarch/strpbrk.S: New file. + * sysdeps/x86_64/multiarch/strspn-c.c: New file. + * sysdeps/x86_64/multiarch/strspn.S: New file. + +2009-06-30 H.J. Lu <hongjiu.lu@intel.com> + + * elf/Makefile (distribute): Remove tst-audit.sh. Add + tst-audit2.c, tst-audit3.c, tst-auditmod3a.c, tst-auditmod3b.c. + (tests): Add tst-audit3 for x86_64. + (modules-names): Add tst-auditmod3a, tst-auditmod3b. + ($(objpfx)tst-audit3): Define. + ($(objpfx)tst-audit3.out): Define. + (tst-audit3-ENV): Define. + * elf/tst-audit3.c: New file. + * elf/tst-auditmod3a.c: New file. + * elf/tst-auditmod3b.c: New file. + * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Save + and restore xmm6. + + * string/stpncpy.c (STPNCPY): New. Defined if not defined. + (__stpncpy): Renamed to ... + (STPNCPY): This. 
+ (stpncpy): Create alias only if STPNCPY is not defined. + * string/strncpy.c (STRNCPY): New. Defined to strncpy if not + defined. + (strncpy): Renamed to ... + (STRNCPY): This. + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + stpncpy-c strncpy-c for string. + * sysdeps/x86_64/multiarch/stpcpy.S: New file. + * sysdeps/x86_64/multiarch/stpncpy-c.c: New file. + * sysdeps/x86_64/multiarch/stpncpy.S: New file. + * sysdeps/x86_64/multiarch/strcpy.S: New file. + * sysdeps/x86_64/multiarch/strncpy-c.c: New file. + * sysdeps/x86_64/multiarch/strncpy.S: New file. + +2009-07-02 Ulrich Drepper <drepper@redhat.com> + + * malloc/malloc.c [ATOMIC_FASTBINS] (_int_free): Add full barrier when + adding to fast bin list. + 2009-07-01 Ulrich Drepper <drepper@redhat.com> * nis/nss_nis/nis-network.c (_nss_nis_getnetbyaddr_r): Don't use diff --git a/config.h.in b/config.h.in index 8dbc224a7d..18bf01a38c 100644 --- a/config.h.in +++ b/config.h.in @@ -59,6 +59,9 @@ assembler's `.type' directive, if it has one. */ #undef ASM_TYPE_DIRECTIVE_PREFIX +/* Define if the assembler supports the gnu_unique_object symbol type. */ +#undef HAVE_ASM_UNIQUE_OBJECT + /* Define a symbol_name as a global .symbol_name for ld. */ #undef HAVE_ASM_GLOBAL_DOT_NAME @@ -129,6 +132,12 @@ /* Define if binutils support TLS handling. */ #undef HAVE_TLS_SUPPORT +/* Define if gcc supports SSE4. */ +#undef HAVE_SSE4_SUPPORT + +/* Define if gcc supports AVX. */ +#undef HAVE_AVX_SUPPORT + /* Define if the compiler's exception support is based on libunwind. */ #undef HAVE_CC_WITH_LIBUNWIND diff --git a/config.make.in b/config.make.in index e48ea2658d..d65706ceac 100644 --- a/config.make.in +++ b/config.make.in @@ -34,6 +34,9 @@ config-sysdirs = @sysnames@ cflags-cpu = @libc_cv_cc_submachine@ asflags-cpu = @libc_cv_cc_submachine@ +config-cflags-sse4 = @libc_cv_cc_sse4@ +config-cflags-avx = @libc_cv_cc_avx@ + defines = @DEFINES@ sysincludes = @SYSINCLUDES@ c++-sysincludes = @CXX_SYSINCLUDES@ diff --git a/configure b/configure index 88cf4fd853..48e6952bbd 100755 --- a/configure +++ b/configure @@ -657,6 +657,8 @@ xcoff elf ldd_rewrite_script use_ldconfig +libc_cv_cc_avx +libc_cv_cc_sse4 libc_cv_cpp_asm_debuginfo libc_cv_forced_unwind libc_cv_rootsbindir @@ -5993,6 +5995,32 @@ _ACEOF fi +{ $as_echo "$as_me:$LINENO: checking for assembler gnu_unique_object symbol type" >&5 +$as_echo_n "checking for assembler gnu_unique_object symbol type... " >&6; } +if test "${libc_cv_asm_unique_object+set}" = set; then + $as_echo_n "(cached) " >&6 +else + cat > conftest.s <<EOF +${libc_cv_dot_text} +_sym: +.type _sym, ${libc_cv_asm_type_prefix}gnu_unique_object +EOF +if ${CC-cc} -c $ASFLAGS conftest.s 1>&5 2>&5; then + libc_cv_asm_unique_object=yes +else + libc_cv_asm_unique_object=no +fi +rm -f conftest* +fi +{ $as_echo "$as_me:$LINENO: result: $libc_cv_asm_unique_object" >&5 +$as_echo "$libc_cv_asm_unique_object" >&6; } +if test $libc_cv_asm_unique_object = yes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_ASM_UNIQUE_OBJECT 1 +_ACEOF + +fi + # For the multi-arch option we need support in the assembler. 
if test "$multi_arch" = yes; then if test "x$libc_cv_asm_type_prefix" != xno; then @@ -8744,6 +8772,8 @@ fi + + if test $elf = yes; then cat >>confdefs.h <<\_ACEOF #define HAVE_ELF 1 diff --git a/configure.in b/configure.in index 6a92bd876a..4584afe605 100644 --- a/configure.in +++ b/configure.in @@ -1211,6 +1211,23 @@ if test "x$libc_cv_asm_type_prefix" != xno; then AC_DEFINE_UNQUOTED(ASM_TYPE_DIRECTIVE_PREFIX, ${libc_cv_asm_type_prefix}) fi +AC_CACHE_CHECK(for assembler gnu_unique_object symbol type, + libc_cv_asm_unique_object, [dnl +cat > conftest.s <<EOF +${libc_cv_dot_text} +_sym: +.type _sym, ${libc_cv_asm_type_prefix}gnu_unique_object +EOF +if ${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD; then + libc_cv_asm_unique_object=yes +else + libc_cv_asm_unique_object=no +fi +rm -f conftest*]) +if test $libc_cv_asm_unique_object = yes; then + AC_DEFINE(HAVE_ASM_UNIQUE_OBJECT) +fi + # For the multi-arch option we need support in the assembler. if test "$multi_arch" = yes; then if test "x$libc_cv_asm_type_prefix" != xno; then @@ -2259,6 +2276,8 @@ AC_SUBST(libc_cv_forced_unwind) dnl sysdeps/CPU/configure.in checks set this via arch-specific asm tests AC_SUBST(libc_cv_cpp_asm_debuginfo) +AC_SUBST(libc_cv_cc_sse4) +AC_SUBST(libc_cv_cc_avx) AC_SUBST(use_ldconfig) AC_SUBST(ldd_rewrite_script) diff --git a/elf/Makefile b/elf/Makefile index 56935d5a1a..21d131ec92 100644 --- a/elf/Makefile +++ b/elf/Makefile @@ -47,7 +47,7 @@ distribute := rtld-Rules \ dl-cache.h dl-hash.h soinit.c sofini.c ldd.bash.in \ genrtldtbl.awk atomicity.h dl-procinfo.h ldsodefs.h \ dl-librecon.h interp.c sln.c dl-dst.h hp-timing.h \ - do-lookup.h dl-lookupcfg.h sprof.c gen-trusted-dirs.awk \ + dl-lookupcfg.h sprof.c gen-trusted-dirs.awk \ testobj1.c testobj2.c testobj3.c testobj4.c testobj5.c \ testobj6.c testobj1_1.c failobj.c unloadmod.c \ ldconfig.h ldconfig.c cache.c readlib.c readelflib.c \ @@ -89,7 +89,10 @@ distribute := rtld-Rules \ unload4mod1.c unload4mod2.c unload4mod3.c unload4mod4.c \ unload6mod1.c unload6mod2.c unload6mod3.c \ unload7mod1.c unload7mod2.c \ - tst-auditmod1.c tst-audit.sh \ + tst-audit1.c tst-audit2.c tst-audit3.c tst-audit4.c \ + tst-auditmod1.c tst-auditmod3a.c tst-auditmod3b.c \ + tst-auditmod4a.c tst-auditmod4b.c \ + tst-audit5.c tst-auditmod5a.c tst-auditmod5b.c \ order2mod1.c order2mod2.c order2mod3.c order2mod4.c \ tst-stackguard1.c tst-stackguard1-static.c \ tst-array5.c tst-array5-static.c tst-array5dep.c \ @@ -110,7 +113,9 @@ distribute := rtld-Rules \ ifuncdep5.c ifuncdep5pic.c ifuncmod5.c \ ifuncmain6pie.c ifuncmod6.c \ ifuncmain7.c ifuncmain7pic.c ifuncmain7picstatic.c \ - ifuncmain7pie.c ifuncmain7static.c + ifuncmain7pie.c ifuncmain7static.c \ + tst-unique1.c tst-unique1mod1.c tst-unique1mod2.c \ + tst-unique2.c tst-unique2mod1.c tst-unique2mod2.c CFLAGS-dl-runtime.c = -fexceptions -fasynchronous-unwind-tables CFLAGS-dl-lookup.c = -fexceptions -fasynchronous-unwind-tables @@ -189,10 +194,14 @@ tests += loadtest restest1 preloadtest loadfail multiload origtest resolvfail \ tst-dlmopen1 tst-dlmopen2 tst-dlmopen3 \ unload3 unload4 unload5 unload6 unload7 tst-global1 order2 \ tst-audit1 tst-audit2 \ - tst-stackguard1 tst-addr1 tst-thrlock + tst-stackguard1 tst-addr1 tst-thrlock \ + tst-unique1 tst-unique2 # reldep9 test-srcs = tst-pathopt tests-execstack-yes = tst-execstack tst-execstack-needed tst-execstack-prog +ifeq (x86_64,$(config-machine)) +tests += tst-audit3 tst-audit4 tst-audit5 +endif endif ifeq (yesyes,$(have-fpie)$(build-shared)) tests: 
$(objpfx)tst-pie1.out @@ -234,13 +243,20 @@ modules-names = testobj1 testobj2 testobj3 testobj4 testobj5 testobj6 \ unload4mod1 unload4mod2 unload4mod3 unload4mod4 \ unload6mod1 unload6mod2 unload6mod3 \ unload7mod1 unload7mod2 \ - order2mod1 order2mod2 order2mod3 order2mod4 + order2mod1 order2mod2 order2mod3 order2mod4 \ + tst-unique1mod1 tst-unique1mod2 \ + tst-unique2mod1 tst-unique2mod2 ifeq (yes,$(have-initfini-array)) modules-names += tst-array2dep tst-array5dep endif ifeq (yesyes,$(have-fpie)$(build-shared)) modules-names += tst-piemod1 endif +ifeq (x86_64,$(config-machine)) +modules-names += tst-auditmod3a tst-auditmod3b \ + tst-auditmod4a tst-auditmod4b \ + tst-auditmod5a tst-auditmod5b +endif modules-execstack-yes = tst-execstack-mod extra-test-objs += $(addsuffix .os,$(strip $(modules-names))) # We need this variable to be sure the test modules get the right CPPFLAGS. @@ -959,6 +975,18 @@ tst-audit1-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so $(objpfx)tst-audit2.out: $(objpfx)tst-auditmod1.so tst-audit2-ENV = LD_AUDIT=$(objpfx)tst-auditmod1.so +$(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so +$(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so +tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so + +$(objpfx)tst-audit4: $(objpfx)tst-auditmod4a.so +$(objpfx)tst-audit4.out: $(objpfx)tst-auditmod4b.so +tst-audit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod4b.so + +$(objpfx)tst-audit5: $(objpfx)tst-auditmod5a.so +$(objpfx)tst-audit5.out: $(objpfx)tst-auditmod5b.so +tst-audit5-ENV = LD_AUDIT=$(objpfx)tst-auditmod5b.so + $(objpfx)tst-global1: $(libdl) $(objpfx)tst-global1.out: $(objpfx)testobj6.so $(objpfx)testobj2.so @@ -1094,3 +1122,16 @@ $(objpfx)ifuncmain5pic: $(addprefix $(objpfx),ifuncmod5.so) $(objpfx)ifuncmain5static: $(addprefix $(objpfx),ifuncdep5.o) $(objpfx)ifuncmain5staticpic: $(addprefix $(objpfx),ifuncdep5pic.o) $(objpfx)ifuncmain5picstatic: $(addprefix $(objpfx),ifuncdep5pic.o) + +$(objpfx)tst-unique1: $(libdl) +$(objpfx)tst-unique1.out: $(objpfx)tst-unique1mod1.so \ + $(objpfx)tst-unique1mod2.so + +$(objpfx)tst-unique2: $(libdl) $(objpfx)tst-unique2mod1.so +$(objpfx)tst-unique2.out: $(objpfx)tst-unique2mod2.so + +ifeq (yes,$(config-cflags-avx)) +CFLAGS-tst-audit4.c += -mavx +CFLAGS-tst-auditmod4a.c += -mavx +CFLAGS-tst-auditmod4b.c += -mavx +endif diff --git a/elf/dl-lookup.c b/elf/dl-lookup.c index 707d650719..fd0d624430 100644 --- a/elf/dl-lookup.c +++ b/elf/dl-lookup.c @@ -69,8 +69,371 @@ struct sym_val #endif -/* The actual lookup code. */ -#include "do-lookup.h" +/* Inner part of the lookup functions. We return a value > 0 if we + found the symbol, the value 0 if nothing is found and < 0 if + something bad happened. */ +static int +__attribute_noinline__ +do_lookup_x (const char *undef_name, uint_fast32_t new_hash, + unsigned long int *old_hash, const ElfW(Sym) *ref, + struct sym_val *result, struct r_scope_elem *scope, size_t i, + const struct r_found_version *const version, int flags, + struct link_map *skip, int type_class, struct link_map *undef_map) +{ + size_t n = scope->r_nlist; + /* Make sure we read the value before proceeding. Otherwise we + might use r_list pointing to the initial scope and r_nlist being + the value after a resize. That is the only path in dl-open.c not + protected by GSCOPE. A read barrier here might be to expensive. */ + __asm volatile ("" : "+r" (n), "+m" (scope->r_list)); + struct link_map **list = scope->r_list; + + do + { + /* These variables are used in the nested function. 
*/ + Elf_Symndx symidx; + int num_versions = 0; + const ElfW(Sym) *versioned_sym = NULL; + + const struct link_map *map = list[i]->l_real; + + /* Here come the extra test needed for `_dl_lookup_symbol_skip'. */ + if (map == skip) + continue; + + /* Don't search the executable when resolving a copy reloc. */ + if ((type_class & ELF_RTYPE_CLASS_COPY) && map->l_type == lt_executable) + continue; + + /* Do not look into objects which are going to be removed. */ + if (map->l_removed) + continue; + + /* Print some debugging info if wanted. */ + if (__builtin_expect (GLRO(dl_debug_mask) & DL_DEBUG_SYMBOLS, 0)) + _dl_debug_printf ("symbol=%s; lookup in file=%s [%lu]\n", + undef_name, + map->l_name[0] ? map->l_name : rtld_progname, + map->l_ns); + + /* If the hash table is empty there is nothing to do here. */ + if (map->l_nbuckets == 0) + continue; + + /* The tables for this map. */ + const ElfW(Sym) *symtab = (const void *) D_PTR (map, l_info[DT_SYMTAB]); + const char *strtab = (const void *) D_PTR (map, l_info[DT_STRTAB]); + + + /* Nested routine to check whether the symbol matches. */ + const ElfW(Sym) * + __attribute_noinline__ + check_match (const ElfW(Sym) *sym) + { + unsigned int stt = ELFW(ST_TYPE) (sym->st_info); + assert (ELF_RTYPE_CLASS_PLT == 1); + if (__builtin_expect ((sym->st_value == 0 /* No value. */ + && stt != STT_TLS) + || (type_class & (sym->st_shndx == SHN_UNDEF)), + 0)) + return NULL; + + /* Ignore all but STT_NOTYPE, STT_OBJECT, STT_FUNC, + STT_COMMON, STT_TLS, and STT_GNU_IFUNC since these are no + code/data definitions. */ +#define ALLOWED_STT \ + ((1 << STT_NOTYPE) | (1 << STT_OBJECT) | (1 << STT_FUNC) \ + | (1 << STT_COMMON) | (1 << STT_TLS) | (1 << STT_GNU_IFUNC)) + if (__builtin_expect (((1 << stt) & ALLOWED_STT) == 0, 0)) + return NULL; + + if (sym != ref && strcmp (strtab + sym->st_name, undef_name)) + /* Not the symbol we are looking for. */ + return NULL; + + const ElfW(Half) *verstab = map->l_versyms; + if (version != NULL) + { + if (__builtin_expect (verstab == NULL, 0)) + { + /* We need a versioned symbol but haven't found any. If + this is the object which is referenced in the verneed + entry it is a bug in the library since a symbol must + not simply disappear. + + It would also be a bug in the object since it means that + the list of required versions is incomplete and so the + tests in dl-version.c haven't found a problem.*/ + assert (version->filename == NULL + || ! _dl_name_match_p (version->filename, map)); + + /* Otherwise we accept the symbol. */ + } + else + { + /* We can match the version information or use the + default one if it is not hidden. */ + ElfW(Half) ndx = verstab[symidx] & 0x7fff; + if ((map->l_versions[ndx].hash != version->hash + || strcmp (map->l_versions[ndx].name, version->name)) + && (version->hidden || map->l_versions[ndx].hash + || (verstab[symidx] & 0x8000))) + /* It's not the version we want. */ + return NULL; + } + } + else + { + /* No specific version is selected. There are two ways we + can got here: + + - a binary which does not include versioning information + is loaded + + - dlsym() instead of dlvsym() is used to get a symbol which + might exist in more than one form + + If the library does not provide symbol version information + there is no problem at at: we simply use the symbol if it + is defined. + + These two lookups need to be handled differently if the + library defines versions. In the case of the old + unversioned application the oldest (default) version + should be used. 
In case of a dlsym() call the latest and + public interface should be returned. */ + if (verstab != NULL) + { + if ((verstab[symidx] & 0x7fff) + >= ((flags & DL_LOOKUP_RETURN_NEWEST) ? 2 : 3)) + { + /* Don't accept hidden symbols. */ + if ((verstab[symidx] & 0x8000) == 0 + && num_versions++ == 0) + /* No version so far. */ + versioned_sym = sym; + + return NULL; + } + } + } + + /* There cannot be another entry for this symbol so stop here. */ + return sym; + } + + const ElfW(Sym) *sym; + const ElfW(Addr) *bitmask = map->l_gnu_bitmask; + if (__builtin_expect (bitmask != NULL, 1)) + { + ElfW(Addr) bitmask_word + = bitmask[(new_hash / __ELF_NATIVE_CLASS) + & map->l_gnu_bitmask_idxbits]; + + unsigned int hashbit1 = new_hash & (__ELF_NATIVE_CLASS - 1); + unsigned int hashbit2 = ((new_hash >> map->l_gnu_shift) + & (__ELF_NATIVE_CLASS - 1)); + + if (__builtin_expect ((bitmask_word >> hashbit1) + & (bitmask_word >> hashbit2) & 1, 0)) + { + Elf32_Word bucket = map->l_gnu_buckets[new_hash + % map->l_nbuckets]; + if (bucket != 0) + { + const Elf32_Word *hasharr = &map->l_gnu_chain_zero[bucket]; + + do + if (((*hasharr ^ new_hash) >> 1) == 0) + { + symidx = hasharr - map->l_gnu_chain_zero; + sym = check_match (&symtab[symidx]); + if (sym != NULL) + goto found_it; + } + while ((*hasharr++ & 1u) == 0); + } + } + /* No symbol found. */ + symidx = SHN_UNDEF; + } + else + { + if (*old_hash == 0xffffffff) + *old_hash = _dl_elf_hash (undef_name); + + /* Use the old SysV-style hash table. Search the appropriate + hash bucket in this object's symbol table for a definition + for the same symbol name. */ + for (symidx = map->l_buckets[*old_hash % map->l_nbuckets]; + symidx != STN_UNDEF; + symidx = map->l_chain[symidx]) + { + sym = check_match (&symtab[symidx]); + if (sym != NULL) + goto found_it; + } + } + + /* If we have seen exactly one versioned symbol while we are + looking for an unversioned symbol and the version is not the + default version we still accept this symbol since there are + no possible ambiguities. */ + sym = num_versions == 1 ? versioned_sym : NULL; + + if (sym != NULL) + { + found_it: + switch (__builtin_expect (ELFW(ST_BIND) (sym->st_info), STB_GLOBAL)) + { + case STB_WEAK: + /* Weak definition. Use this value if we don't find another. */ + if (__builtin_expect (GLRO(dl_dynamic_weak), 0)) + { + if (! result->s) + { + result->s = sym; + result->m = (struct link_map *) map; + } + break; + } + /* FALLTHROUGH */ + case STB_GLOBAL: + success: + /* Global definition. Just what we need. */ + result->s = sym; + result->m = (struct link_map *) map; + return 1; + + case STB_GNU_UNIQUE:; + /* We have to determine whether we already found a + symbol with this name before. If not then we have to + add it to the search table. If we already found a + definition we have to use it. 
*/ + void enter (struct unique_sym *table, size_t size, + unsigned int hash, const char *name, + const ElfW(Sym) *sym, const struct link_map *map) + { + size_t idx = hash % size; + size_t hash2 = 1 + hash % (size - 2); + while (1) + { + if (table[idx].hashval == 0) + { + table[idx].hashval = hash; + table[idx].name = strtab + sym->st_name; + if ((type_class & ELF_RTYPE_CLASS_COPY) != 0) + { + table[idx].sym = ref; + table[idx].map = undef_map; + } + else + { + table[idx].sym = sym; + table[idx].map = map; + } + return; + } + + idx += hash2; + if (idx >= size) + idx -= size; + } + } + + struct unique_sym_table *tab + = &GL(dl_ns)[map->l_ns]._ns_unique_sym_table; + + __rtld_lock_lock_recursive (tab->lock); + + struct unique_sym *entries = tab->entries; + size_t size = tab->size; + if (entries != NULL) + { + size_t idx = new_hash % size; + size_t hash2 = 1 + new_hash % (size - 2); + while (1) + { + if (entries[idx].hashval == new_hash + && strcmp (entries[idx].name, undef_name) == 0) + { + result->s = entries[idx].sym; + result->m = (struct link_map *) entries[idx].map; + __rtld_lock_unlock_recursive (tab->lock); + return 1; + } + + if (entries[idx].hashval == 0 + && entries[idx].name == NULL) + break; + + idx += hash2; + if (idx >= size) + idx -= size; + } + + if (size * 3 <= tab->n_elements) + { + /* Expand the table. */ + size_t newsize = _dl_higher_prime_number (size); + struct unique_sym *newentries + = calloc (sizeof (struct unique_sym), newsize); + if (newentries == NULL) + { + nomem: + __rtld_lock_unlock_recursive (tab->lock); + _dl_fatal_printf ("out of memory\n"); + } + + for (idx = 0; idx < size; ++idx) + if (entries[idx].hashval != 0) + enter (newentries, newsize, entries[idx].hashval, + entries[idx].name, entries[idx].sym, + entries[idx].map); + + tab->free (entries); + tab->size = newsize; + entries = tab->entries = newentries; + tab->free = free; + } + } + else + { +#define INITIAL_NUNIQUE_SYM_TABLE 31 + size = INITIAL_NUNIQUE_SYM_TABLE; + entries = calloc (sizeof (struct unique_sym), size); + if (entries == NULL) + goto nomem; + + tab->entries = entries; + tab->size = size; + tab->free = free; + } + + enter (entries, size, new_hash, strtab + sym->st_name, sym, map); + ++tab->n_elements; + + __rtld_lock_unlock_recursive (tab->lock); + + goto success; + + default: + /* Local symbols are ignored. */ + break; + } + } + + /* If this current map is the one mentioned in the verneed entry + and we have not found a weak entry, it is a bug. */ + if (symidx == STN_UNDEF && version != NULL && version->filename != NULL + && __builtin_expect (_dl_name_match_p (version->filename, map), 0)) + return -1; + } + while (++i < n); + + /* We have not found anything until now. 
*/ + return 0; +} static uint_fast32_t @@ -337,7 +700,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map, { int res = do_lookup_x (undef_name, new_hash, &old_hash, *ref, &current_value, *scope, start, version, flags, - skip_map, type_class); + skip_map, type_class, undef_map); if (res > 0) break; @@ -410,7 +773,7 @@ _dl_lookup_symbol_x (const char *undef_name, struct link_map *undef_map, for (scope = symbol_scope; *scope != NULL; i = 0, ++scope) if (do_lookup_x (undef_name, new_hash, &old_hash, *ref, &protected_value, *scope, i, version, flags, - skip_map, ELF_RTYPE_CLASS_PLT) != 0) + skip_map, ELF_RTYPE_CLASS_PLT, NULL) != 0) break; if (protected_value.s != NULL && protected_value.m != undef_map) @@ -536,7 +899,7 @@ _dl_debug_bindings (const char *undef_name, struct link_map *undef_map, do_lookup_x (undef_name, new_hash, &old_hash, *ref, &val, undef_map->l_local_scope[0], 0, version, 0, NULL, - type_class); + type_class, undef_map); if (val.s != value->s || val.m != value->m) conflict = 1; diff --git a/elf/dl-misc.c b/elf/dl-misc.c index 7c77cd040f..d50537ade7 100644 --- a/elf/dl-misc.c +++ b/elf/dl-misc.c @@ -312,3 +312,68 @@ _dl_name_match_p (const char *name, const struct link_map *map) return 0; } + + +unsigned long int +internal_function +_dl_higher_prime_number (unsigned long int n) +{ + /* These are primes that are near, but slightly smaller than, a + power of two. */ + static const uint32_t primes[] = { + UINT32_C (7), + UINT32_C (13), + UINT32_C (31), + UINT32_C (61), + UINT32_C (127), + UINT32_C (251), + UINT32_C (509), + UINT32_C (1021), + UINT32_C (2039), + UINT32_C (4093), + UINT32_C (8191), + UINT32_C (16381), + UINT32_C (32749), + UINT32_C (65521), + UINT32_C (131071), + UINT32_C (262139), + UINT32_C (524287), + UINT32_C (1048573), + UINT32_C (2097143), + UINT32_C (4194301), + UINT32_C (8388593), + UINT32_C (16777213), + UINT32_C (33554393), + UINT32_C (67108859), + UINT32_C (134217689), + UINT32_C (268435399), + UINT32_C (536870909), + UINT32_C (1073741789), + UINT32_C (2147483647), + /* 4294967291L */ + UINT32_C (2147483647) + UINT32_C (2147483644) }; + + const uint32_t *low = &primes[0]; + const uint32_t *high = &primes[sizeof (primes) / sizeof (primes[0])]; + + while (low != high) + { + const uint32_t *mid = low + (high - low) / 2; + if (n > *mid) + low = mid + 1; + else + high = mid; + } + +#if 0 + /* If we've run out of primes, abort. */ + if (n > *low) + { + fprintf (stderr, "Cannot find prime bigger than %lu\n", n); + abort (); + } +#endif + + return *low; +} diff --git a/elf/dl-open.c b/elf/dl-open.c index c3f0e42d5e..b8ebfe0e60 100644 --- a/elf/dl-open.c +++ b/elf/dl-open.c @@ -569,7 +569,7 @@ _dl_open (const char *file, int mode, const void *caller_dlopen, Lmid_t nsid, if (GL(dl_ns)[nsid]._ns_loaded == NULL) break; - if (nsid == DL_NNS) + if (__builtin_expect (nsid == DL_NNS, 0)) { /* No more namespace available. */ __rtld_lock_unlock_recursive (GL(dl_load_lock)); @@ -579,7 +579,10 @@ no more namespaces available for dlmopen()")); } if (nsid == GL(dl_nns)) - ++GL(dl_nns); + { + __rtld_lock_initialize (GL(dl_ns)[nsid]._ns_unique_sym_table.lock); + ++GL(dl_nns); + } _dl_debug_initialize (0, nsid)->r_state = RT_CONSISTENT; } diff --git a/elf/do-lookup.h b/elf/do-lookup.h deleted file mode 100644 index ae74da4846..0000000000 --- a/elf/do-lookup.h +++ /dev/null @@ -1,271 +0,0 @@ -/* Look up a symbol in the loaded objects. - Copyright (C) 1995-2007, 2008, 2009 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - - -/* Inner part of the lookup functions. We return a value > 0 if we - found the symbol, the value 0 if nothing is found and < 0 if - something bad happened. */ -static int -__attribute_noinline__ -do_lookup_x (const char *undef_name, uint_fast32_t new_hash, - unsigned long int *old_hash, const ElfW(Sym) *ref, - struct sym_val *result, struct r_scope_elem *scope, size_t i, - const struct r_found_version *const version, int flags, - struct link_map *skip, int type_class) -{ - size_t n = scope->r_nlist; - /* Make sure we read the value before proceeding. Otherwise we - might use r_list pointing to the initial scope and r_nlist being - the value after a resize. That is the only path in dl-open.c not - protected by GSCOPE. A read barrier here might be to expensive. */ - __asm volatile ("" : "+r" (n), "+m" (scope->r_list)); - struct link_map **list = scope->r_list; - - do - { - /* These variables are used in the nested function. */ - Elf_Symndx symidx; - int num_versions = 0; - const ElfW(Sym) *versioned_sym = NULL; - - const struct link_map *map = list[i]->l_real; - - /* Here come the extra test needed for `_dl_lookup_symbol_skip'. */ - if (map == skip) - continue; - - /* Don't search the executable when resolving a copy reloc. */ - if ((type_class & ELF_RTYPE_CLASS_COPY) && map->l_type == lt_executable) - continue; - - /* Do not look into objects which are going to be removed. */ - if (map->l_removed) - continue; - - /* Print some debugging info if wanted. */ - if (__builtin_expect (GLRO(dl_debug_mask) & DL_DEBUG_SYMBOLS, 0)) - _dl_debug_printf ("symbol=%s; lookup in file=%s [%lu]\n", - undef_name, - map->l_name[0] ? map->l_name : rtld_progname, - map->l_ns); - - /* If the hash table is empty there is nothing to do here. */ - if (map->l_nbuckets == 0) - continue; - - /* The tables for this map. */ - const ElfW(Sym) *symtab = (const void *) D_PTR (map, l_info[DT_SYMTAB]); - const char *strtab = (const void *) D_PTR (map, l_info[DT_STRTAB]); - - - /* Nested routine to check whether the symbol matches. */ - const ElfW(Sym) * - __attribute_noinline__ - check_match (const ElfW(Sym) *sym) - { - unsigned int stt = ELFW(ST_TYPE) (sym->st_info); - assert (ELF_RTYPE_CLASS_PLT == 1); - if (__builtin_expect ((sym->st_value == 0 /* No value. */ - && stt != STT_TLS) - || (type_class & (sym->st_shndx == SHN_UNDEF)), - 0)) - return NULL; - - if (__builtin_expect (stt > STT_FUNC - && stt != STT_COMMON - && stt != STT_TLS - && stt != STT_GNU_IFUNC, 0)) - /* Ignore all but STT_NOTYPE, STT_OBJECT, STT_FUNC, STT_COMMON, - STT_TLS, and STT_GNU_IFUNC since these are no code/data - definitions. */ - return NULL; - - if (sym != ref && strcmp (strtab + sym->st_name, undef_name)) - /* Not the symbol we are looking for. 
*/ - return NULL; - - const ElfW(Half) *verstab = map->l_versyms; - if (version != NULL) - { - if (__builtin_expect (verstab == NULL, 0)) - { - /* We need a versioned symbol but haven't found any. If - this is the object which is referenced in the verneed - entry it is a bug in the library since a symbol must - not simply disappear. - - It would also be a bug in the object since it means that - the list of required versions is incomplete and so the - tests in dl-version.c haven't found a problem.*/ - assert (version->filename == NULL - || ! _dl_name_match_p (version->filename, map)); - - /* Otherwise we accept the symbol. */ - } - else - { - /* We can match the version information or use the - default one if it is not hidden. */ - ElfW(Half) ndx = verstab[symidx] & 0x7fff; - if ((map->l_versions[ndx].hash != version->hash - || strcmp (map->l_versions[ndx].name, version->name)) - && (version->hidden || map->l_versions[ndx].hash - || (verstab[symidx] & 0x8000))) - /* It's not the version we want. */ - return NULL; - } - } - else - { - /* No specific version is selected. There are two ways we - can got here: - - - a binary which does not include versioning information - is loaded - - - dlsym() instead of dlvsym() is used to get a symbol which - might exist in more than one form - - If the library does not provide symbol version information - there is no problem at at: we simply use the symbol if it - is defined. - - These two lookups need to be handled differently if the - library defines versions. In the case of the old - unversioned application the oldest (default) version - should be used. In case of a dlsym() call the latest and - public interface should be returned. */ - if (verstab != NULL) - { - if ((verstab[symidx] & 0x7fff) - >= ((flags & DL_LOOKUP_RETURN_NEWEST) ? 2 : 3)) - { - /* Don't accept hidden symbols. */ - if ((verstab[symidx] & 0x8000) == 0 - && num_versions++ == 0) - /* No version so far. */ - versioned_sym = sym; - - return NULL; - } - } - } - - /* There cannot be another entry for this symbol so stop here. */ - return sym; - } - - const ElfW(Sym) *sym; - const ElfW(Addr) *bitmask = map->l_gnu_bitmask; - if (__builtin_expect (bitmask != NULL, 1)) - { - ElfW(Addr) bitmask_word - = bitmask[(new_hash / __ELF_NATIVE_CLASS) - & map->l_gnu_bitmask_idxbits]; - - unsigned int hashbit1 = new_hash & (__ELF_NATIVE_CLASS - 1); - unsigned int hashbit2 = ((new_hash >> map->l_gnu_shift) - & (__ELF_NATIVE_CLASS - 1)); - - if (__builtin_expect ((bitmask_word >> hashbit1) - & (bitmask_word >> hashbit2) & 1, 0)) - { - Elf32_Word bucket = map->l_gnu_buckets[new_hash - % map->l_nbuckets]; - if (bucket != 0) - { - const Elf32_Word *hasharr = &map->l_gnu_chain_zero[bucket]; - - do - if (((*hasharr ^ new_hash) >> 1) == 0) - { - symidx = hasharr - map->l_gnu_chain_zero; - sym = check_match (&symtab[symidx]); - if (sym != NULL) - goto found_it; - } - while ((*hasharr++ & 1u) == 0); - } - } - /* No symbol found. */ - symidx = SHN_UNDEF; - } - else - { - if (*old_hash == 0xffffffff) - *old_hash = _dl_elf_hash (undef_name); - - /* Use the old SysV-style hash table. Search the appropriate - hash bucket in this object's symbol table for a definition - for the same symbol name. 
*/ - for (symidx = map->l_buckets[*old_hash % map->l_nbuckets]; - symidx != STN_UNDEF; - symidx = map->l_chain[symidx]) - { - sym = check_match (&symtab[symidx]); - if (sym != NULL) - goto found_it; - } - } - - /* If we have seen exactly one versioned symbol while we are - looking for an unversioned symbol and the version is not the - default version we still accept this symbol since there are - no possible ambiguities. */ - sym = num_versions == 1 ? versioned_sym : NULL; - - if (sym != NULL) - { - found_it: - switch (ELFW(ST_BIND) (sym->st_info)) - { - case STB_WEAK: - /* Weak definition. Use this value if we don't find another. */ - if (__builtin_expect (GLRO(dl_dynamic_weak), 0)) - { - if (! result->s) - { - result->s = sym; - result->m = (struct link_map *) map; - } - break; - } - /* FALLTHROUGH */ - case STB_GLOBAL: - /* Global definition. Just what we need. */ - result->s = sym; - result->m = (struct link_map *) map; - return 1; - default: - /* Local symbols are ignored. */ - break; - } - } - - /* If this current map is the one mentioned in the verneed entry - and we have not found a weak entry, it is a bug. */ - if (symidx == STN_UNDEF && version != NULL && version->filename != NULL - && __builtin_expect (_dl_name_match_p (version->filename, map), 0)) - return -1; - } - while (++i < n); - - /* We have not found anything until now. */ - return 0; -} diff --git a/elf/elf.h b/elf/elf.h index 8fdf74b099..7efdedefb4 100644 --- a/elf/elf.h +++ b/elf/elf.h @@ -444,6 +444,7 @@ typedef struct #define STB_WEAK 2 /* Weak symbol */ #define STB_NUM 3 /* Number of defined types. */ #define STB_LOOS 10 /* Start of OS-specific */ +#define STB_GNU_UNIQUE 10 /* Unique symbol. */ #define STB_HIOS 12 /* End of OS-specific */ #define STB_LOPROC 13 /* Start of processor-specific */ #define STB_HIPROC 15 /* End of processor-specific */ diff --git a/elf/rtld.c b/elf/rtld.c index f97de9ac08..55b84c3bf4 100644 --- a/elf/rtld.c +++ b/elf/rtld.c @@ -127,7 +127,12 @@ struct rtld_global _rtld_global = #ifdef _LIBC_REENTRANT ._dl_load_lock = _RTLD_LOCK_RECURSIVE_INITIALIZER, #endif - ._dl_nns = 1 + ._dl_nns = 1, + ._dl_ns = + { + [LM_ID_BASE] = { ._ns_unique_sym_table + = { .lock = _RTLD_LOCK_RECURSIVE_INITIALIZER } } + } }; /* If we would use strong_alias here the compiler would see a non-hidden definition. This would undo the effect of the previous diff --git a/elf/tst-audit3.c b/elf/tst-audit3.c new file mode 100644 index 0000000000..d00db9972b --- /dev/null +++ b/elf/tst-audit3.c @@ -0,0 +1,20 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#include <stdlib.h> +#include <string.h> + +#include <emmintrin.h> + +extern __m128i audit_test (__m128i, __m128i, __m128i, __m128i, + __m128i, __m128i, __m128i, __m128i); +int +main (void) +{ + __m128i xmm = _mm_setzero_si128 (); + __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm); + + if (memcmp (&xmm, &ret, sizeof (ret))) + abort (); + + return 0; +} diff --git a/elf/tst-audit4.c b/elf/tst-audit4.c new file mode 100644 index 0000000000..b17d4a61a7 --- /dev/null +++ b/elf/tst-audit4.c @@ -0,0 +1,35 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#ifdef __AVX__ +#include <stdlib.h> +#include <string.h> +#include <cpuid.h> +#include <immintrin.h> + +extern __m256i audit_test (__m256i, __m256i, __m256i, __m256i, + __m256i, __m256i, __m256i, __m256i); +int +main (void) +{ + unsigned int eax, ebx, ecx, edx; + + /* Run AVX test only if AVX is supported. 
*/ + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_AVX)) + { + __m256i ymm = _mm256_setzero_si256 (); + __m256i ret = audit_test (ymm, ymm, ymm, ymm, ymm, ymm, ymm, ymm); + + ymm = _mm256_set1_epi32 (0x12349876); + if (memcmp (&ymm, &ret, sizeof (ret))) + abort (); + } + return 0; +} +#else +int +main (void) +{ + return 0; +} +#endif diff --git a/elf/tst-audit5.c b/elf/tst-audit5.c new file mode 100644 index 0000000000..0094fee61f --- /dev/null +++ b/elf/tst-audit5.c @@ -0,0 +1,21 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#include <stdlib.h> +#include <string.h> + +#include <emmintrin.h> + +extern __m128i audit_test (__m128i, __m128i, __m128i, __m128i, + __m128i, __m128i, __m128i, __m128i); +int +main (void) +{ + __m128i xmm = _mm_setzero_si128 (); + __m128i ret = audit_test (xmm, xmm, xmm, xmm, xmm, xmm, xmm, xmm); + + xmm = _mm_set1_epi32 (0x12349876); + if (memcmp (&xmm, &ret, sizeof (ret))) + abort (); + + return 0; +} diff --git a/elf/tst-auditmod3a.c b/elf/tst-auditmod3a.c new file mode 100644 index 0000000000..9514aba505 --- /dev/null +++ b/elf/tst-auditmod3a.c @@ -0,0 +1,24 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#include <stdlib.h> +#include <string.h> +#include <emmintrin.h> + +__m128i +audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3, + __m128i x4, __m128i x5, __m128i x6, __m128i x7) +{ + __m128i xmm = _mm_setzero_si128 (); + + if (memcmp (&xmm, &x0, sizeof (xmm)) + || memcmp (&xmm, &x1, sizeof (xmm)) + || memcmp (&xmm, &x2, sizeof (xmm)) + || memcmp (&xmm, &x3, sizeof (xmm)) + || memcmp (&xmm, &x4, sizeof (xmm)) + || memcmp (&xmm, &x5, sizeof (xmm)) + || memcmp (&xmm, &x6, sizeof (xmm)) + || memcmp (&xmm, &x7, sizeof (xmm))) + abort (); + + return xmm; +} diff --git a/elf/tst-auditmod3b.c b/elf/tst-auditmod3b.c new file mode 100644 index 0000000000..388ed6e49c --- /dev/null +++ b/elf/tst-auditmod3b.c @@ -0,0 +1,156 @@ +/* Verify that changing xmm registers in audit library won't affect + function parameter passing/return. 
*/ + +#include <dlfcn.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> +#include <emmintrin.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind32 (Elf32_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind32: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#define pltenter la_x86_64_gnu_pltenter +#define pltexit la_x86_64_gnu_pltexit +#define La_regs La_x86_64_regs +#define La_retval La_x86_64_retval +#define int_retval lrv_rax + +#include <tst-audit.h> + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + __m128i xmm = _mm_set1_epi32 (-1); + asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" ); + asm volatile ("movdqa %0, %%xmm2" : : "x" (xmm) : "xmm2" ); + asm volatile ("movdqa %0, %%xmm3" : : "x" (xmm) : "xmm3" ); + asm volatile ("movdqa %0, %%xmm4" : : "x" (xmm) : "xmm4" ); + asm volatile ("movdqa %0, %%xmm5" : : "x" (xmm) : "xmm5" ); + asm volatile ("movdqa %0, %%xmm6" : : "x" (xmm) : "xmm6" ); + asm volatile ("movdqa %0, %%xmm7" : : "x" (xmm) : "xmm7" ); + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, 
ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, outregs->int_retval); + + __m128i xmm = _mm_set1_epi32 (-1); + asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" ); + asm volatile ("movdqa %0, %%xmm2" : : "x" (xmm) : "xmm2" ); + asm volatile ("movdqa %0, %%xmm3" : : "x" (xmm) : "xmm3" ); + asm volatile ("movdqa %0, %%xmm4" : : "x" (xmm) : "xmm4" ); + asm volatile ("movdqa %0, %%xmm5" : : "x" (xmm) : "xmm5" ); + asm volatile ("movdqa %0, %%xmm6" : : "x" (xmm) : "xmm6" ); + asm volatile ("movdqa %0, %%xmm7" : : "x" (xmm) : "xmm7" ); + + return 0; +} diff --git a/elf/tst-auditmod4a.c b/elf/tst-auditmod4a.c new file mode 100644 index 0000000000..c9c24c04a8 --- /dev/null +++ b/elf/tst-auditmod4a.c @@ -0,0 +1,48 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#ifdef __AVX__ +#include <stdlib.h> +#include <string.h> +#include <immintrin.h> + +__m256i +audit_test (__m256i x0, __m256i x1, __m256i x2, __m256i x3, + __m256i x4, __m256i x5, __m256i x6, __m256i x7) +{ + __m256i ymm; + + ymm = _mm256_set1_epi32 (1); + if (memcmp (&ymm, &x0, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (2); + if (memcmp (&ymm, &x1, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (3); + if (memcmp (&ymm, &x2, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (4); + if (memcmp (&ymm, &x3, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (5); + if (memcmp (&ymm, &x4, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (6); + if (memcmp (&ymm, &x5, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (7); + if (memcmp (&ymm, &x6, sizeof (ymm))) + abort (); + + ymm = _mm256_set1_epi32 (8); + if (memcmp (&ymm, &x7, sizeof (ymm))) + abort (); + + return _mm256_setzero_si256 (); +} +#endif diff --git a/elf/tst-auditmod4b.c b/elf/tst-auditmod4b.c new file mode 100644 index 0000000000..a6d3c6a6c5 --- /dev/null +++ b/elf/tst-auditmod4b.c @@ -0,0 +1,206 @@ +/* Verify that changing AVX registers in audit library won't affect + function parameter passing/return. 
*/ + +#include <dlfcn.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#define pltenter la_x86_64_gnu_pltenter +#define pltexit la_x86_64_gnu_pltexit +#define La_regs La_x86_64_regs +#define La_retval La_x86_64_retval +#define int_retval lrv_rax + +#include <tst-audit.h> + +#ifdef __AVX__ +#include <immintrin.h> +#include <cpuid.h> + +static int avx = -1; + +static int +__attribute ((always_inline)) +check_avx (void) +{ + if (avx == -1) + { + unsigned int eax, ebx, ecx, edx; + + if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) + && (ecx & bit_AVX)) + avx = 1; + else + avx = 0; + } + return avx; +} +#else +#include <emmintrin.h> +#endif + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + __m256i zero = _mm256_setzero_si256 (); + if (memcmp (&regs->lr_vector[0], &zero, sizeof (zero)) + || memcmp (&regs->lr_vector[1], &zero, sizeof (zero)) + || memcmp (&regs->lr_vector[2], &zero, sizeof (zero)) + || memcmp (&regs->lr_vector[3], &zero, sizeof (zero)) + || memcmp (&regs->lr_vector[4], &zero, sizeof (zero)) + || memcmp (&regs->lr_vector[5], &zero, sizeof (zero)) + || memcmp (&regs->lr_vector[6], &zero, sizeof (zero)) + || memcmp (&regs->lr_vector[7], &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + regs->lr_vector[i].ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (i + 1); + + __m256i ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, 
%%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + asm volatile ("vmovdqa %0, %%ymm2" : : "x" (ymm) : "xmm2" ); + asm volatile ("vmovdqa %0, %%ymm3" : : "x" (ymm) : "xmm3" ); + asm volatile ("vmovdqa %0, %%ymm4" : : "x" (ymm) : "xmm4" ); + asm volatile ("vmovdqa %0, %%ymm5" : : "x" (ymm) : "xmm5" ); + asm volatile ("vmovdqa %0, %%ymm6" : : "x" (ymm) : "xmm6" ); + asm volatile ("vmovdqa %0, %%ymm7" : : "x" (ymm) : "xmm7" ); + + *framesizep = 1024; + } +#endif + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, outregs->int_retval); + +#ifdef __AVX__ + if (check_avx () && strcmp (symname, "audit_test") == 0) + { + __m256i zero = _mm256_setzero_si256 (); + if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + { + __m256i ymm = _mm256_set1_epi32 (i + 1); + if (memcmp (&inregs->lr_vector[i], &ymm, sizeof (ymm)) != 0) + abort (); + } + + outregs->lrv_vector0.ymm[0] + = (La_x86_64_ymm) _mm256_set1_epi32 (0x12349876); + + __m256i ymm = _mm256_set1_epi32 (-1); + asm volatile ("vmovdqa %0, %%ymm0" : : "x" (ymm) : "xmm0" ); + asm volatile ("vmovdqa %0, %%ymm1" : : "x" (ymm) : "xmm1" ); + } +#endif + + return 0; +} diff --git a/elf/tst-auditmod5a.c b/elf/tst-auditmod5a.c new file mode 100644 index 0000000000..8511a70747 --- /dev/null +++ b/elf/tst-auditmod5a.c @@ -0,0 +1,46 @@ +/* Test case for x86-64 preserved registers in dynamic linker. */ + +#include <stdlib.h> +#include <string.h> +#include <emmintrin.h> + +__m128i +audit_test (__m128i x0, __m128i x1, __m128i x2, __m128i x3, + __m128i x4, __m128i x5, __m128i x6, __m128i x7) +{ + __m128i xmm; + + xmm = _mm_set1_epi32 (1); + if (memcmp (&xmm, &x0, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (2); + if (memcmp (&xmm, &x1, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (3); + if (memcmp (&xmm, &x2, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (4); + if (memcmp (&xmm, &x3, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (5); + if (memcmp (&xmm, &x4, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (6); + if (memcmp (&xmm, &x5, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (7); + if (memcmp (&xmm, &x6, sizeof (xmm))) + abort (); + + xmm = _mm_set1_epi32 (8); + if (memcmp (&xmm, &x7, sizeof (xmm))) + abort (); + + return _mm_setzero_si128 (); +} diff --git a/elf/tst-auditmod5b.c b/elf/tst-auditmod5b.c new file mode 100644 index 0000000000..7e1e941126 --- /dev/null +++ b/elf/tst-auditmod5b.c @@ -0,0 +1,178 @@ +/* Verify that changing xmm registers in audit library won't affect + function parameter passing/return. 
*/ + +#include <dlfcn.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <bits/wordsize.h> +#include <gnu/lib-names.h> +#include <emmintrin.h> + +unsigned int +la_version (unsigned int v) +{ + setlinebuf (stdout); + + printf ("version: %u\n", v); + + char buf[20]; + sprintf (buf, "%u", v); + + return v; +} + +void +la_activity (uintptr_t *cookie, unsigned int flag) +{ + if (flag == LA_ACT_CONSISTENT) + printf ("activity: consistent\n"); + else if (flag == LA_ACT_ADD) + printf ("activity: add\n"); + else if (flag == LA_ACT_DELETE) + printf ("activity: delete\n"); + else + printf ("activity: unknown activity %u\n", flag); +} + +char * +la_objsearch (const char *name, uintptr_t *cookie, unsigned int flag) +{ + char buf[100]; + const char *flagstr; + if (flag == LA_SER_ORIG) + flagstr = "LA_SET_ORIG"; + else if (flag == LA_SER_LIBPATH) + flagstr = "LA_SER_LIBPATH"; + else if (flag == LA_SER_RUNPATH) + flagstr = "LA_SER_RUNPATH"; + else if (flag == LA_SER_CONFIG) + flagstr = "LA_SER_CONFIG"; + else if (flag == LA_SER_DEFAULT) + flagstr = "LA_SER_DEFAULT"; + else if (flag == LA_SER_SECURE) + flagstr = "LA_SER_SECURE"; + else + { + sprintf (buf, "unknown flag %d", flag); + flagstr = buf; + } + printf ("objsearch: %s, %s\n", name, flagstr); + + return (char *) name; +} + +unsigned int +la_objopen (struct link_map *l, Lmid_t lmid, uintptr_t *cookie) +{ + printf ("objopen: %ld, %s\n", lmid, l->l_name); + + return 3; +} + +void +la_preinit (uintptr_t *cookie) +{ + printf ("preinit\n"); +} + +unsigned int +la_objclose (uintptr_t *cookie) +{ + printf ("objclose\n"); + return 0; +} + +uintptr_t +la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, unsigned int *flags, const char *symname) +{ + printf ("symbind64: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + return sym->st_value; +} + +#define pltenter la_x86_64_gnu_pltenter +#define pltexit la_x86_64_gnu_pltexit +#define La_regs La_x86_64_regs +#define La_retval La_x86_64_retval +#define int_retval lrv_rax + +#include <tst-audit.h> + +ElfW(Addr) +pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, La_regs *regs, unsigned int *flags, + const char *symname, long int *framesizep) +{ + printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n", + symname, (long int) sym->st_value, ndx, *flags); + + __m128i minusone = _mm_set1_epi32 (-1); + + if (strcmp (symname, "audit_test") == 0) + { + __m128i zero = _mm_setzero_si128 (); + if (memcmp (&regs->lr_xmm[0], &zero, sizeof (zero)) + || memcmp (&regs->lr_xmm[1], &zero, sizeof (zero)) + || memcmp (&regs->lr_xmm[2], &zero, sizeof (zero)) + || memcmp (&regs->lr_xmm[3], &zero, sizeof (zero)) + || memcmp (&regs->lr_xmm[4], &zero, sizeof (zero)) + || memcmp (&regs->lr_xmm[5], &zero, sizeof (zero)) + || memcmp (&regs->lr_xmm[6], &zero, sizeof (zero)) + || memcmp (&regs->lr_xmm[7], &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + regs->lr_xmm[i] = (La_x86_64_xmm) _mm_set1_epi32 (i + 1); + + *framesizep = 1024; + } + + asm volatile ("movdqa %0, %%xmm0" : : "x" (minusone) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (minusone) : "xmm1" ); + asm volatile ("movdqa %0, %%xmm2" : : "x" (minusone) : "xmm2" ); + asm volatile ("movdqa %0, %%xmm3" : : "x" (minusone) : "xmm3" ); + asm volatile ("movdqa %0, %%xmm4" : : "x" (minusone) : "xmm4" ); + asm volatile ("movdqa %0, %%xmm5" : : "x" (minusone) : "xmm5" ); + asm volatile 
("movdqa %0, %%xmm6" : : "x" (minusone) : "xmm6" ); + asm volatile ("movdqa %0, %%xmm7" : : "x" (minusone) : "xmm7" ); + + return sym->st_value; +} + +unsigned int +pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook, + uintptr_t *defcook, const La_regs *inregs, La_retval *outregs, + const char *symname) +{ + printf ("pltexit: symname=%s, st_value=%#lx, ndx=%u, retval=%tu\n", + symname, (long int) sym->st_value, ndx, outregs->int_retval); + + __m128i xmm; + + if (strcmp (symname, "audit_test") == 0) + { + __m128i zero = _mm_setzero_si128 (); + if (memcmp (&outregs->lrv_xmm0, &zero, sizeof (zero))) + abort (); + + for (int i = 0; i < 8; i++) + { + xmm = _mm_set1_epi32 (i + 1); + if (memcmp (&inregs->lr_xmm[i], &xmm, sizeof (xmm)) != 0) + abort (); + } + + outregs->lrv_xmm0 = (La_x86_64_xmm) _mm_set1_epi32 (0x12349876); + } + + xmm = _mm_set1_epi32 (-1); + asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" ); + asm volatile ("movdqa %0, %%xmm1" : : "x" (xmm) : "xmm1" ); + + return 0; +} diff --git a/elf/tst-unique1.c b/elf/tst-unique1.c new file mode 100644 index 0000000000..9b7996cd96 --- /dev/null +++ b/elf/tst-unique1.c @@ -0,0 +1,40 @@ +#include <config.h> +#include <dlfcn.h> +#include <stdio.h> + +static int +do_test (void) +{ +#ifdef HAVE_ASM_UNIQUE_OBJECT + void *h1 = dlopen ("tst-unique1mod1.so", RTLD_LAZY); + if (h1 == NULL) + { + puts ("cannot load tst-unique1mod1"); + return 1; + } + int *(*f1) (void) = dlsym (h1, "f"); + if (f1 == NULL) + { + puts ("cannot locate f in tst-unique1mod1"); + return 1; + } + void *h2 = dlopen ("tst-unique1mod2.so", RTLD_LAZY); + if (h2 == NULL) + { + puts ("cannot load tst-unique1mod2"); + return 1; + } + int (*f2) (int *) = dlsym (h2, "f"); + if (f2 == NULL) + { + puts ("cannot locate f in tst-unique1mod2"); + return 1; + } + return f2 (f1 ()); +#else + return 0; +#endif +} + +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" diff --git a/elf/tst-unique1mod1.c b/elf/tst-unique1mod1.c new file mode 100644 index 0000000000..16de28d25e --- /dev/null +++ b/elf/tst-unique1mod1.c @@ -0,0 +1,21 @@ +#include <config.h> + +#ifdef HAVE_ASM_UNIQUE_OBJECT +# define S(s) _S (s) +# define _S(s) #s + +asm (".data;" + S (ASM_GLOBAL_DIRECTIVE) " var\n" + ".type var, " S (ASM_TYPE_DIRECTIVE_PREFIX) "gnu_unique_object\n" + ".size var, 4\n" + "var:.zero 4\n" + ".previous"); +extern int var; + +int * +f (void) +{ + var = 1; + return &var; +} +#endif diff --git a/elf/tst-unique1mod2.c b/elf/tst-unique1mod2.c new file mode 100644 index 0000000000..c075515827 --- /dev/null +++ b/elf/tst-unique1mod2.c @@ -0,0 +1,20 @@ +#include <config.h> + +#ifdef HAVE_ASM_UNIQUE_OBJECT +# define S(s) _S (s) +# define _S(s) #s + +asm (".data;" + S (ASM_GLOBAL_DIRECTIVE) " var\n" + ".type var, " S (ASM_TYPE_DIRECTIVE_PREFIX) "gnu_unique_object\n" + ".size var, 4\n" + "var:.zero 4\n" + ".previous"); +extern int var; + +int +f (int *p) +{ + return &var != p || *p != 1; +} +#endif diff --git a/elf/tst-unique2.c b/elf/tst-unique2.c new file mode 100644 index 0000000000..7bb0687364 --- /dev/null +++ b/elf/tst-unique2.c @@ -0,0 +1,32 @@ +#include <config.h> +#include <dlfcn.h> +#include <stdio.h> + +extern int var; + +static int +do_test (void) +{ +#ifdef HAVE_ASM_UNIQUE_OBJECT + var = 1; + + void *h = dlopen ("tst-unique2mod2.so", RTLD_LAZY); + if (h == NULL) + { + puts ("cannot load tst-unique2mod2"); + return 1; + } + int (*f) (int *) = dlsym (h, "f"); + if (f == NULL) + { + puts ("cannot locate f in tst-unique2mod2"); + return 1; + } + return f (&var); +#else + 
return 0; +#endif +} + +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" diff --git a/elf/tst-unique2mod1.c b/elf/tst-unique2mod1.c new file mode 100644 index 0000000000..5e4ac4d68c --- /dev/null +++ b/elf/tst-unique2mod1.c @@ -0,0 +1,13 @@ +#include <config.h> + +#ifdef HAVE_ASM_UNIQUE_OBJECT +# define S(s) _S (s) +# define _S(s) #s + +asm (".data;" + S (ASM_GLOBAL_DIRECTIVE) " var\n" + ".type var, " S (ASM_TYPE_DIRECTIVE_PREFIX) "gnu_unique_object\n" + ".size var, 4\n" + "var:.zero 4\n" + ".previous"); +#endif diff --git a/elf/tst-unique2mod2.c b/elf/tst-unique2mod2.c new file mode 100644 index 0000000000..c075515827 --- /dev/null +++ b/elf/tst-unique2mod2.c @@ -0,0 +1,20 @@ +#include <config.h> + +#ifdef HAVE_ASM_UNIQUE_OBJECT +# define S(s) _S (s) +# define _S(s) #s + +asm (".data;" + S (ASM_GLOBAL_DIRECTIVE) " var\n" + ".type var, " S (ASM_TYPE_DIRECTIVE_PREFIX) "gnu_unique_object\n" + ".size var, 4\n" + "var:.zero 4\n" + ".previous"); +extern int var; + +int +f (int *p) +{ + return &var != p || *p != 1; +} +#endif diff --git a/include/atomic.h b/include/atomic.h index 9366f78734..37d0111d5f 100644 --- a/include/atomic.h +++ b/include/atomic.h @@ -107,14 +107,19 @@ #endif -#ifndef atomic_compare_and_exchange_val_rel -# define atomic_compare_and_exchange_val_rel(mem, newval, oldval) \ - atomic_compare_and_exchange_val_acq (mem, newval, oldval) +#ifndef catomic_compare_and_exchange_val_rel +# ifndef atomic_compare_and_exchange_val_rel +# define catomic_compare_and_exchange_val_rel(mem, newval, oldval) \ + catomic_compare_and_exchange_val_acq (mem, newval, oldval) +# else +# define catomic_compare_and_exchange_val_rel(mem, newval, oldval) \ + atomic_compare_and_exchange_val_rel (mem, newval, oldval) +# endif #endif -#ifndef catomic_compare_and_exchange_val_rel -# define catomic_compare_and_exchange_val_rel(mem, newval, oldval) \ +#ifndef atomic_compare_and_exchange_val_rel +# define atomic_compare_and_exchange_val_rel(mem, newval, oldval) \ atomic_compare_and_exchange_val_acq (mem, newval, oldval) #endif @@ -155,15 +160,20 @@ #endif -#ifndef atomic_compare_and_exchange_bool_rel -# define atomic_compare_and_exchange_bool_rel(mem, newval, oldval) \ - atomic_compare_and_exchange_bool_acq (mem, newval, oldval) +#ifndef catomic_compare_and_exchange_bool_rel +# ifndef atomic_compare_and_exchange_bool_rel +# define catomic_compare_and_exchange_bool_rel(mem, newval, oldval) \ + catomic_compare_and_exchange_bool_acq (mem, newval, oldval) +# else +# define catomic_compare_and_exchange_bool_rel(mem, newval, oldval) \ + atomic_compare_and_exchange_bool_rel (mem, newval, oldval) +# endif #endif -#ifndef catomic_compare_and_exchange_bool_rel -# define catomic_compare_and_exchange_bool_rel(mem, newval, oldval) \ - catomic_compare_and_exchange_bool_acq (mem, newval, oldval) +#ifndef atomic_compare_and_exchange_bool_rel +# define atomic_compare_and_exchange_bool_rel(mem, newval, oldval) \ + atomic_compare_and_exchange_bool_acq (mem, newval, oldval) #endif diff --git a/include/inline-hashtab.h b/include/inline-hashtab.h index c359161c54..0f6719b91c 100644 --- a/include/inline-hashtab.h +++ b/include/inline-hashtab.h @@ -1,7 +1,5 @@ /* Fully-inline hash table, used mainly for managing TLS descriptors. - - Copyright (C) 1999, 2000, 2001, 2002, 2003, 2005, 2008 - Free Software Foundation, Inc. + Copyright (C) 1999-2003, 2005, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. 
Contributed by Alexandre Oliva <aoliva@redhat.com> @@ -30,69 +28,6 @@ extern void weak_function free (void *ptr); -inline static unsigned long -higher_prime_number (unsigned long n) -{ - /* These are primes that are near, but slightly smaller than, a - power of two. */ - static const uint32_t primes[] = { - UINT32_C (7), - UINT32_C (13), - UINT32_C (31), - UINT32_C (61), - UINT32_C (127), - UINT32_C (251), - UINT32_C (509), - UINT32_C (1021), - UINT32_C (2039), - UINT32_C (4093), - UINT32_C (8191), - UINT32_C (16381), - UINT32_C (32749), - UINT32_C (65521), - UINT32_C (131071), - UINT32_C (262139), - UINT32_C (524287), - UINT32_C (1048573), - UINT32_C (2097143), - UINT32_C (4194301), - UINT32_C (8388593), - UINT32_C (16777213), - UINT32_C (33554393), - UINT32_C (67108859), - UINT32_C (134217689), - UINT32_C (268435399), - UINT32_C (536870909), - UINT32_C (1073741789), - UINT32_C (2147483647), - /* 4294967291L */ - UINT32_C (2147483647) + UINT32_C (2147483644) - }; - - const uint32_t *low = &primes[0]; - const uint32_t *high = &primes[sizeof (primes) / sizeof (primes[0])]; - - while (low != high) - { - const uint32_t *mid = low + (high - low) / 2; - if (n > *mid) - low = mid + 1; - else - high = mid; - } - -#if 0 - /* If we've run out of primes, abort. */ - if (n > *low) - { - fprintf (stderr, "Cannot find prime bigger than %lu\n", n); - abort (); - } -#endif - - return *low; -} - struct hashtab { /* Table itself. */ @@ -203,12 +138,11 @@ htab_expand (struct hashtab *htab, int (*hash_fn) (void *)) /* Resize only when table after removal of unused elements is either too full or too empty. */ if (htab->n_elements * 2 > htab->size) - nsize = higher_prime_number (htab->n_elements * 2); + nsize = _dl_higher_prime_number (htab->n_elements * 2); else nsize = htab->size; - nentries = malloc (sizeof (void *) * nsize); - memset (nentries, 0, sizeof (void *) * nsize); + nentries = calloc (sizeof (void *), nsize); if (nentries == NULL) return 0; htab->entries = nentries; diff --git a/malloc/malloc.c b/malloc/malloc.c index 516d401991..a459a2b89d 100644 --- a/malloc/malloc.c +++ b/malloc/malloc.c @@ -4799,8 +4799,29 @@ _int_free(mstate av, mchunkptr p) || __builtin_expect (chunksize (chunk_at_offset (p, size)) >= av->system_mem, 0)) { - errstr = "free(): invalid next size (fast)"; - goto errout; +#ifdef ATOMIC_FASTBINS + /* We might not have a lock at this point and concurrent modifications + of system_mem might have led to a false positive. Redo the test + after getting the lock. */ + if (have_lock + || ({ assert (locked == 0); + mutex_lock(&av->mutex); + locked = 1; + chunk_at_offset (p, size)->size <= 2 * SIZE_SZ + || chunksize (chunk_at_offset (p, size)) >= av->system_mem; + })) +#endif + { + errstr = "free(): invalid next size (fast)"; + goto errout; + } +#ifdef ATOMIC_FASTBINS + if (! have_lock) + { + (void)mutex_unlock(&av->mutex); + locked = 0; + } +#endif } if (__builtin_expect (perturb_byte, 0)) @@ -4823,7 +4844,7 @@ _int_free(mstate av, mchunkptr p) } p->fd = fd = old; } - while ((old = catomic_compare_and_exchange_val_acq (fb, p, fd)) != fd); + while ((old = catomic_compare_and_exchange_val_rel (fb, p, fd)) != fd); #else /* Another simple check: make sure the top of the bin is not the record we are going to add (i.e., double free). 
*/ diff --git a/nptl/ChangeLog b/nptl/ChangeLog index 3eb85d9dea..3eded66512 100644 --- a/nptl/ChangeLog +++ b/nptl/ChangeLog @@ -1,3 +1,63 @@ +2009-07-19 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/unix/sysv/linux/x86_64/lowlevellock.h: Define + FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S: If mutex + is a PI mutex, then use FUTEX_CMP_REQUEUE_PI. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S: Likewise. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S: If mutex + is a PI mutex, then use FUTEX_WAIT_REQUEUE_PI. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S: Likewise. + + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S + (__pthread_cond_timedwait): Make more robust. + +2009-07-18 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S + (__lll_robust_timedlock_wait): If possible use FUTEX_WAIT_BITSET to + directly use absolute timeout. + + * tst-sem5.c (do_test): Add test for premature timeout. + * Makefile: Link tst-sem5 with librt. + + * sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S + (pthread_rwlock_timedwrlock): If possible use FUTEX_WAIT_BITSET to + directly use absolute timeout. + * sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S + (pthread_rwlock_timedrdlock): Likewise. + + * tst-cond11.c (run_test): Add test to check that the timeout is + long enough. + + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S + (__pthread_cond_timedwait): If possible use FUTEX_WAIT_BITSET to + directly use absolute timeout. + + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S + (__pthread_cond_wait): Convert to using exception handler instead of + registered unwind buffer. + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S + (__pthread_cond_timedwait): Likewise. + +2009-07-17 Ulrich Drepper <drepper@redhat.com> + + * sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S (sem_timedwait): + If possible use FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME to directly + use absolute timeout. + + * sysdeps/unix/sysv/linux/x86_64/sem_wait.S (sem_wait): Optimize + handling of uncontested semaphore. + + * sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S + (__condvar_cleanup): Rewrite to use cfi directives instead of + hand-coded unwind tables. + * sysdeps/unix/sysv/linux/x86_64/pthread_once.S (__pthread_once): + Likewise. + * sysdeps/unix/sysv/linux/x86_64/sem_wait.S (sem_wait): Likewise. + * sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S (sem_timedwait): + Likewise. + 2009-06-12 Ulrich Drepper <drepper@redhat.com> * Makefile (libpthread-routines): Add pthread_sigqueue. 
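The 2009-07-17 and 2009-07-18 entries above all rely on the same kernel feature: plain FUTEX_WAIT takes a timeout relative to the time of the call, so the old code had to call gettimeofday/clock_gettime and subtract before every wait, while FUTEX_WAIT_BITSET or-ed with FUTEX_CLOCK_REALTIME lets the kernel compare directly against an absolute CLOCK_REALTIME deadline. The following minimal C sketch shows the syscall the new assembly paths issue; it is illustrative only and not part of the patch, the helper name is invented, and it assumes a kernel that supports both flags, which is precisely what the __have_futex_clock_realtime checks in the files below guard.

#include <errno.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/* Sketch: block until *futexp no longer equals EXPECTED or the
   absolute CLOCK_REALTIME deadline ABSTIME passes.  0xffffffff is
   the "match any" bitset, the same value the assembly loads into
   %r9d before the syscall.  */
static int
futex_wait_abstime (int *futexp, int expected,
                    const struct timespec *abstime)
{
  long r = syscall (SYS_futex, futexp,
                    FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME,
                    expected, abstime, NULL, 0xffffffff);
  return r == -1 ? errno : 0;   /* errno is ETIMEDOUT past the deadline.  */
}

The real code additionally XORs in FUTEX_PRIVATE_FLAG for process-private objects and falls back to the old relative-timeout path (.Lreltmo) when the kernel lacks these flags.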
diff --git a/nptl/Makefile b/nptl/Makefile index c99aec8ae0..e8180021ec 100644 --- a/nptl/Makefile +++ b/nptl/Makefile @@ -480,6 +480,7 @@ $(objpfx)tst-fini1: $(shared-thread-library) $(objpfx)tst-fini1mod.so ifeq (yes,$(build-shared)) $(objpfx)tst-cond11: $(common-objpfx)rt/librt.so $(objpfx)tst-cond19: $(common-objpfx)rt/librt.so +$(objpfx)tst-sem5: $(common-objpfx)rt/librt.so $(objpfx)tst-cancel17: $(common-objpfx)rt/librt.so $(objpfx)tst-cancelx17: $(common-objpfx)rt/librt.so $(objpfx)tst-cancel18: $(common-objpfx)rt/librt.so @@ -493,6 +494,7 @@ $(objpfx)tst-_res1: $(objpfx)tst-_res1mod2.so $(shared-thread-library) else $(objpfx)tst-cond11: $(common-objpfx)rt/librt.a $(objpfx)tst-cond19: $(common-objpfx)rt/librt.a +$(objpfx)tst-sem5: $(common-objpfx)rt/librt.a $(objpfx)tst-cancel17: $(common-objpfx)rt/librt.a $(objpfx)tst-cancelx17: $(common-objpfx)rt/librt.a $(objpfx)tst-cancel18: $(common-objpfx)rt/librt.a diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h b/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h index 0b7e3bbaba..9b15bfbc57 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h @@ -54,6 +54,8 @@ #define FUTEX_TRYLOCK_PI 8 #define FUTEX_WAIT_BITSET 9 #define FUTEX_WAKE_BITSET 10 +#define FUTEX_WAIT_REQUEUE_PI 11 +#define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S b/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S index fa7516ef71..02db0a4f9d 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/lowlevelrobustlock.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 +/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. @@ -32,6 +32,8 @@ #ifdef __ASSUME_PRIVATE_FUTEX # define LOAD_FUTEX_WAIT(reg) \ xorl $(FUTEX_WAIT | FUTEX_PRIVATE_FLAG), reg +# define LOAD_FUTEX_WAIT_ABS(reg) \ + xorl $(FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME), reg #else # if FUTEX_WAIT == 0 # define LOAD_FUTEX_WAIT(reg) \ @@ -43,6 +45,10 @@ andl %fs:PRIVATE_FUTEX, reg ; \ orl $FUTEX_WAIT, reg # endif +# define LOAD_FUTEX_WAIT_ABS(reg) \ + xorl $FUTEX_PRIVATE_FLAG, reg ; \ + andl %fs:PRIVATE_FUTEX, reg ; \ + orl $FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME, reg #endif /* For the calculation see asm/vsyscall.h. */ @@ -110,6 +116,73 @@ __lll_robust_lock_wait: .align 16 __lll_robust_timedlock_wait: cfi_startproc +# ifndef __ASSUME_FUTEX_CLOCK_REALTIME +# ifdef PIC + cmpl $0, __have_futex_clock_realtime(%rip) +# else + cmpl $0, __have_futex_clock_realtime +# endif + je .Lreltmo +# endif + + pushq %r9 + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r9, 0) + movq %rdx, %r10 + movl $0xffffffff, %r9d + LOAD_FUTEX_WAIT_ABS (%esi) + +1: testl $FUTEX_OWNER_DIED, %eax + jnz 3f + + movl %eax, %edx + orl $FUTEX_WAITERS, %edx + + cmpl %eax, %edx + je 5f + + LOCK + cmpxchgl %edx, (%rdi) + movq $0, %rcx /* Must use mov to avoid changing cc. 
*/ + jnz 6f + +5: movl $SYS_futex, %eax + syscall + movl %eax, %ecx + + movl (%rdi), %eax + +6: testl %eax, %eax + jne 2f + + movl %fs:TID, %edx + orl $FUTEX_WAITERS, %edx + LOCK + cmpxchgl %edx, (%rdi) + jnz 2f + +3: popq %r9 + cfi_adjust_cfa_offset(-8) + cfi_restore(%r9) + retq + + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r9, 0) + /* Check whether the time expired. */ +2: cmpl $-ETIMEDOUT, %ecx + je 4f + cmpl $-EINVAL, %ecx + jne 1b + +4: movl %ecx, %eax + negl %eax + jmp 3b + cfi_adjust_cfa_offset(-8) + cfi_restore(%r9) + + +# ifndef __ASSUME_FUTEX_CLOCK_REALTIME +.Lreltmo: /* Check for a valid timeout value. */ cmpq $1000000000, 8(%rdx) jae 3f @@ -223,10 +296,11 @@ __lll_robust_timedlock_wait: cfi_offset(%r12, -32) cfi_offset(%r13, -40) /* Check whether the time expired. */ -7: cmpq $-ETIMEDOUT, %rcx +7: cmpl $-ETIMEDOUT, %ecx jne 1b 8: movl $ETIMEDOUT, %eax jmp 6b +#endif cfi_endproc .size __lll_robust_timedlock_wait,.-__lll_robust_timedlock_wait diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S index 6155255eb0..0f10ec910c 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_broadcast.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 +/* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. @@ -70,12 +70,14 @@ __pthread_cond_broadcast: 8: cmpq $-1, %r8 je 9f - /* XXX: The kernel so far doesn't support requeue to PI futex. */ - /* XXX: The kernel only supports FUTEX_CMP_REQUEUE to the same - type of futex (private resp. shared). */ - testl $(PI_BIT | PS_BIT), MUTEX_KIND(%r8) + /* Do not use requeue for pshared condvars. */ + testl $PS_BIT, MUTEX_KIND(%r8) jne 9f + /* Requeue to a PI mutex if the PI bit is set. */ + testl $PI_BIT, MUTEX_KIND(%r8) + jne 81f + /* Wake up all threads. */ #ifdef __ASSUME_PRIVATE_FUTEX movl $(FUTEX_CMP_REQUEUE|FUTEX_PRIVATE_FLAG), %esi @@ -97,6 +99,20 @@ __pthread_cond_broadcast: 10: xorl %eax, %eax retq + /* Wake up all threads. */ +81: movl $(FUTEX_CMP_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi + movl $SYS_futex, %eax + movl $1, %edx + movl $0x7fffffff, %r10d + syscall + + /* For any kind of error, which mainly is EAGAIN, we try again + with WAKE. The general test also covers running on old + kernels. */ + cmpq $-4095, %rax + jb 10b + jmp 9f + .align 16 /* Unlock. */ 4: LOCK diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S index 8f65f2cd69..f1050fea7c 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_signal.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc. +/* Copyright (C) 2002-2005, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. @@ -21,6 +21,7 @@ #include <shlib-compat.h> #include <lowlevellock.h> #include <lowlevelcond.h> +#include <pthread-pi-defines.h> #include <kernel-features.h> #include <pthread-errnos.h> @@ -56,19 +57,23 @@ __pthread_cond_signal: /* Wake up one thread. */ cmpq $-1, dep_mutex(%r8) + movl $FUTEX_WAKE_OP, %esi movl $1, %edx + movl $SYS_futex, %eax + je 8f + + /* Get the address of the mutex used. 
*/ + movq dep_mutex(%r8), %rcx + testl $PI_BIT, MUTEX_KIND(%rcx) + jne 9f + #ifdef __ASSUME_PRIVATE_FUTEX - movl $FUTEX_WAKE_OP, %eax movl $(FUTEX_WAKE_OP|FUTEX_PRIVATE_FLAG), %esi - cmove %eax, %esi #else - movl $0, %eax - movl %fs:PRIVATE_FUTEX, %esi - cmove %eax, %esi - orl $FUTEX_WAKE_OP, %esi + orl %fs:PRIVATE_FUTEX, %esi #endif - movl $1, %r10d - movl $SYS_futex, %eax + +8: movl $1, %r10d #if cond_lock != 0 addq $cond_lock, %r8 #endif @@ -85,9 +90,27 @@ __pthread_cond_signal: xorl %eax, %eax retq -7: /* %esi should be either FUTEX_WAKE_OP or - FUTEX_WAKE_OP|FUTEX_PRIVATE_FLAG from the previous syscall. */ - xorl $(FUTEX_WAKE ^ FUTEX_WAKE_OP), %esi + /* Wake up one thread and requeue none in the PI Mutex case. */ +9: movl $(FUTEX_CMP_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi + movq %rcx, %r8 + xorq %r10, %r10 + movl (%rdi), %r9d // XXX Can this be right? + syscall + + leaq -cond_futex(%rdi), %r8 + + /* For any kind of error, we try again with WAKE. + The general test also covers running on old kernels. */ + cmpq $-4095, %rax + jb 4f + +7: +#ifdef __ASSUME_PRIVATE_FUTEX + andl $FUTEX_PRIVATE_FLAG, %esi +#else + andl %fs:PRIVATE_FUTEX, %esi +#endif + orl $FUTEX_WAKE, %esi movl $SYS_futex, %eax /* %rdx should be 1 already from $FUTEX_WAKE_OP syscall. movl $1, %edx */ diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S index ddcf106a6d..f81466e1a5 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_timedwait.S @@ -21,6 +21,7 @@ #include <shlib-compat.h> #include <lowlevellock.h> #include <lowlevelcond.h> +#include <pthread-pi-defines.h> #include <pthread-errnos.h> #include <kernel-features.h> @@ -31,13 +32,24 @@ .text + /* int pthread_cond_timedwait (pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *abstime) */ .globl __pthread_cond_timedwait .type __pthread_cond_timedwait, @function .align 16 __pthread_cond_timedwait: +.LSTARTCODE: cfi_startproc +#ifdef SHARED + cfi_personality(DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect, + DW.ref.__gcc_personality_v0) + cfi_lsda(DW_EH_PE_pcrel | DW_EH_PE_sdata4, .LexceptSTART) +#else + cfi_personality(DW_EH_PE_udata4, __gcc_personality_v0) + cfi_lsda(DW_EH_PE_udata4, .LexceptSTART) +#endif + pushq %r12 cfi_adjust_cfa_offset(8) cfi_rel_offset(%r12, 0) @@ -47,23 +59,28 @@ __pthread_cond_timedwait: pushq %r14 cfi_adjust_cfa_offset(8) cfi_rel_offset(%r14, 0) -#define FRAME_SIZE 80 + pushq %r15 + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r15, 0) +#ifdef __ASSUME_FUTEX_CLOCK_REALTIME +# define FRAME_SIZE 32 +#else +# define FRAME_SIZE 48 +#endif subq $FRAME_SIZE, %rsp cfi_adjust_cfa_offset(FRAME_SIZE) cmpq $1000000000, 8(%rdx) movl $EINVAL, %eax - jae 18f + jae 48f /* Stack frame: - rsp + 80 - +--------------------------+ - rsp + 48 | cleanup buffer | + rsp + 48 +--------------------------+ - rsp + 40 | old wake_seq value | + rsp + 32 | timeout value | +--------------------------+ - rsp + 24 | timeout value | + rsp + 24 | old wake_seq value | +--------------------------+ rsp + 16 | mutex pointer | +--------------------------+ @@ -85,8 +102,18 @@ __pthread_cond_timedwait: je 22f movq %rsi, dep_mutex(%rdi) +22: +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME +# ifdef PIC + cmpl $0, __have_futex_clock_realtime(%rip) +# else + cmpl $0, __have_futex_clock_realtime +# endif + je .Lreltmo +#endif + /* Get internal lock. 
*/ -22: movl $1, %esi + movl $1, %esi xorl %eax, %eax LOCK #if cond_lock == 0 @@ -94,98 +121,29 @@ __pthread_cond_timedwait: #else cmpxchgl %esi, cond_lock(%rdi) #endif - jnz 1f + jnz 31f /* Unlock the mutex. */ -2: movq 16(%rsp), %rdi +32: movq 16(%rsp), %rdi xorl %esi, %esi callq __pthread_mutex_unlock_usercnt testl %eax, %eax - jne 16f + jne 46f movq 8(%rsp), %rdi incq total_seq(%rdi) incl cond_futex(%rdi) addl $(1 << nwaiters_shift), cond_nwaiters(%rdi) - /* Install cancellation handler. */ -#ifdef PIC - leaq __condvar_cleanup(%rip), %rsi -#else - leaq __condvar_cleanup, %rsi -#endif - leaq 48(%rsp), %rdi - movq %rsp, %rdx - callq __pthread_cleanup_push - /* Get and store current wakeup_seq value. */ movq 8(%rsp), %rdi movq wakeup_seq(%rdi), %r9 movl broadcast_seq(%rdi), %edx - movq %r9, 40(%rsp) + movq %r9, 24(%rsp) movl %edx, 4(%rsp) - /* Get the current time. */ -8: -#ifdef __NR_clock_gettime - /* Get the clock number. Note that the field in the condvar - structure stores the number minus 1. */ - movq 8(%rsp), %rdi - movl cond_nwaiters(%rdi), %edi - andl $((1 << nwaiters_shift) - 1), %edi - /* Only clocks 0 and 1 are allowed so far. Both are handled in the - kernel. */ - leaq 24(%rsp), %rsi -# ifdef SHARED - movq __vdso_clock_gettime@GOTPCREL(%rip), %rax - movq (%rax), %rax - PTR_DEMANGLE (%rax) - jz 26f - call *%rax - jmp 27f -# endif -26: movl $__NR_clock_gettime, %eax - syscall -27: -# ifndef __ASSUME_POSIX_TIMERS - cmpq $-ENOSYS, %rax - je 19f -# endif - - /* Compute relative timeout. */ - movq (%r13), %rcx - movq 8(%r13), %rdx - subq 24(%rsp), %rcx - subq 32(%rsp), %rdx -#else - leaq 24(%rsp), %rdi - xorl %esi, %esi - movq $VSYSCALL_ADDR_vgettimeofday, %rax - callq *%rax - - /* Compute relative timeout. */ - movq 32(%rsp), %rax - movl $1000, %edx - mul %rdx /* Milli seconds to nano seconds. */ - movq (%r13), %rcx - movq 8(%r13), %rdx - subq 24(%rsp), %rcx - subq %rax, %rdx -#endif - jns 12f - addq $1000000000, %rdx - decq %rcx -12: testq %rcx, %rcx - movq 8(%rsp), %rdi - movq $-ETIMEDOUT, %r14 - js 6f - - /* Store relative timeout. */ -21: movq %rcx, 24(%rsp) - movq %rdx, 32(%rsp) - - movl cond_futex(%rdi), %r12d +38: movl cond_futex(%rdi), %r12d /* Unlock. */ LOCK @@ -194,33 +152,67 @@ __pthread_cond_timedwait: #else decl cond_lock(%rdi) #endif - jne 3f + jne 33f -4: callq __pthread_enable_asynccancel +.LcleanupSTART1: +34: callq __pthread_enable_asynccancel movl %eax, (%rsp) + movq 8(%rsp), %rdi - leaq 24(%rsp), %r10 + movq %r13, %r10 cmpq $-1, dep_mutex(%rdi) - movq %r12, %rdx -#ifdef __ASSUME_PRIVATE_FUTEX - movl $FUTEX_WAIT, %eax - movl $(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi + movl $FUTEX_WAIT_BITSET, %eax + movl $(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi cmove %eax, %esi + je 60f + + movq dep_mutex(%rdi), %r8 + /* Requeue to a PI mutex if the PI bit is set. */ + testl $PI_BIT, MUTEX_KIND(%r8) + je 60f + + movl $(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi + xorl %eax, %eax + /* The following only works like this because we only support + two clocks, represented using a single bit. 
*/ + testl $1, cond_nwaiters(%rdi) + movl $FUTEX_CLOCK_REALTIME, %edx + cmove %edx, %eax + orl %eax, %esi + movq %r12, %rdx + addq $cond_futex, %rdi + movl $SYS_futex, %eax + syscall + + movl $1, %r15d +#ifdef __ASSUME_REQUEUE_PI + jmp 62f #else - movl $0, %eax - movl %fs:PRIVATE_FUTEX, %esi - cmove %eax, %esi -# if FUTEX_WAIT != 0 - orl $FUTEX_WAIT, %esi -# endif + cmpq $-4095, %rax + jnae 62f + + movl $(FUTEX_WAIT_BITSET|FUTEX_PRIVATE_FLAG), %esi + subq $cond_futex, %rdi #endif + +60: xorl %r15d, %r15d + xorl %eax, %eax + /* The following only works like this because we only support + two clocks, represented using a single bit. */ + testl $1, cond_nwaiters(%rdi) + movl $FUTEX_CLOCK_REALTIME, %edx + movl $0xffffffff, %r9d + cmove %edx, %eax + orl %eax, %esi + movq %r12, %rdx addq $cond_futex, %rdi movl $SYS_futex, %eax syscall - movq %rax, %r14 +62: movq %rax, %r14 movl (%rsp), %edi callq __pthread_disable_asynccancel +.LcleanupEND1: /* Lock. */ movq 8(%rsp), %rdi @@ -232,45 +224,45 @@ __pthread_cond_timedwait: #else cmpxchgl %esi, cond_lock(%rdi) #endif - jne 5f + jne 35f -6: movl broadcast_seq(%rdi), %edx +36: movl broadcast_seq(%rdi), %edx movq woken_seq(%rdi), %rax movq wakeup_seq(%rdi), %r9 cmpl 4(%rsp), %edx - jne 23f + jne 53f - cmpq 40(%rsp), %r9 - jbe 15f + cmpq 24(%rsp), %r9 + jbe 45f cmpq %rax, %r9 - ja 9f + ja 39f -15: cmpq $-ETIMEDOUT, %r14 - jne 8b +45: cmpq $-ETIMEDOUT, %r14 + jne 38b -13: incq wakeup_seq(%rdi) +99: incq wakeup_seq(%rdi) incl cond_futex(%rdi) movl $ETIMEDOUT, %r14d - jmp 14f + jmp 44f -23: xorq %r14, %r14 - jmp 24f +53: xorq %r14, %r14 + jmp 54f -9: xorq %r14, %r14 -14: incq woken_seq(%rdi) +39: xorq %r14, %r14 +44: incq woken_seq(%rdi) -24: subl $(1 << nwaiters_shift), cond_nwaiters(%rdi) +54: subl $(1 << nwaiters_shift), cond_nwaiters(%rdi) /* Wake up a thread which wants to destroy the condvar object. */ cmpq $0xffffffffffffffff, total_seq(%rdi) - jne 25f + jne 55f movl cond_nwaiters(%rdi), %eax andl $~((1 << nwaiters_shift) - 1), %eax - jne 25f + jne 55f addq $cond_nwaiters, %rdi cmpq $-1, dep_mutex-cond_nwaiters(%rdi) @@ -289,26 +281,31 @@ __pthread_cond_timedwait: syscall subq $cond_nwaiters, %rdi -25: LOCK +55: LOCK #if cond_lock == 0 decl (%rdi) #else decl cond_lock(%rdi) #endif - jne 10f + jne 40f - /* Remove cancellation handler. */ -11: movq 48+CLEANUP_PREV(%rsp), %rdx - movq %rdx, %fs:CLEANUP + /* If requeue_pi is used the kernel performs the locking of the + mutex. */ +41: xorl %eax, %eax + testl %r15d, %r15d + jnz 63f movq 16(%rsp), %rdi callq __pthread_mutex_cond_lock - testq %rax, %rax +63: testq %rax, %rax cmoveq %r14, %rax -18: addq $FRAME_SIZE, %rsp +48: addq $FRAME_SIZE, %rsp cfi_adjust_cfa_offset(-FRAME_SIZE) + popq %r15 + cfi_adjust_cfa_offset(-8) + cfi_restore(%r15) popq %r14 cfi_adjust_cfa_offset(-8) cfi_restore(%r14) @@ -322,11 +319,11 @@ __pthread_cond_timedwait: retq /* Initial locking failed. */ -1: - cfi_adjust_cfa_offset(3 * 8 + FRAME_SIZE) - cfi_rel_offset(%r12, FRAME_SIZE + 16) - cfi_rel_offset(%r13, FRAME_SIZE + 8) - cfi_rel_offset(%r14, FRAME_SIZE) +31: cfi_adjust_cfa_offset(4 * 8 + FRAME_SIZE) + cfi_rel_offset(%r12, FRAME_SIZE + 24) + cfi_rel_offset(%r13, FRAME_SIZE + 16) + cfi_rel_offset(%r14, FRAME_SIZE + 8) + cfi_rel_offset(%r15, FRAME_SIZE) #if cond_lock != 0 addq $cond_lock, %rdi #endif @@ -335,10 +332,10 @@ __pthread_cond_timedwait: movl $LLL_SHARED, %esi cmovne %eax, %esi callq __lll_lock_wait - jmp 2b + jmp 32b /* Unlock in loop requires wakeup. 
*/ -3: +33: #if cond_lock != 0 addq $cond_lock, %rdi #endif @@ -347,10 +344,10 @@ __pthread_cond_timedwait: movl $LLL_SHARED, %esi cmovne %eax, %esi callq __lll_unlock_wake - jmp 4b + jmp 34b /* Locking in loop failed. */ -5: +35: #if cond_lock != 0 addq $cond_lock, %rdi #endif @@ -362,10 +359,10 @@ __pthread_cond_timedwait: #if cond_lock != 0 subq $cond_lock, %rdi #endif - jmp 6b + jmp 36b /* Unlock after loop requires wakeup. */ -10: +40: #if cond_lock != 0 addq $cond_lock, %rdi #endif @@ -374,10 +371,10 @@ __pthread_cond_timedwait: movl $LLL_SHARED, %esi cmovne %eax, %esi callq __lll_unlock_wake - jmp 11b + jmp 41b /* The initial unlocking of the mutex failed. */ -16: movq 8(%rsp), %rdi +46: movq 8(%rsp), %rdi movq %rax, (%rsp) LOCK #if cond_lock == 0 @@ -385,7 +382,7 @@ __pthread_cond_timedwait: #else decl cond_lock(%rdi) #endif - jne 17f + jne 47f #if cond_lock != 0 addq $cond_lock, %rdi @@ -396,23 +393,229 @@ __pthread_cond_timedwait: cmovne %eax, %esi callq __lll_unlock_wake -17: movq (%rsp), %rax - jmp 18b +47: movq (%rsp), %rax + jmp 48b + + +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME +.Lreltmo: + xorl %r15d, %r15d + + /* Get internal lock. */ + movl $1, %esi + xorl %eax, %eax + LOCK +# if cond_lock == 0 + cmpxchgl %esi, (%rdi) +# else + cmpxchgl %esi, cond_lock(%rdi) +# endif + jnz 1f + + /* Unlock the mutex. */ +2: movq 16(%rsp), %rdi + xorl %esi, %esi + callq __pthread_mutex_unlock_usercnt + + testl %eax, %eax + jne 46b + + movq 8(%rsp), %rdi + incq total_seq(%rdi) + incl cond_futex(%rdi) + addl $(1 << nwaiters_shift), cond_nwaiters(%rdi) + + /* Get and store current wakeup_seq value. */ + movq 8(%rsp), %rdi + movq wakeup_seq(%rdi), %r9 + movl broadcast_seq(%rdi), %edx + movq %r9, 24(%rsp) + movl %edx, 4(%rsp) + + /* Get the current time. */ +8: +# ifdef __NR_clock_gettime + /* Get the clock number. Note that the field in the condvar + structure stores the number minus 1. */ + movq 8(%rsp), %rdi + movl cond_nwaiters(%rdi), %edi + andl $((1 << nwaiters_shift) - 1), %edi + /* Only clocks 0 and 1 are allowed so far. Both are handled in the + kernel. */ + leaq 32(%rsp), %rsi +# ifdef SHARED + movq __vdso_clock_gettime@GOTPCREL(%rip), %rax + movq (%rax), %rax + PTR_DEMANGLE (%rax) + jz 26f + call *%rax + jmp 27f +# endif +26: movl $__NR_clock_gettime, %eax + syscall +27: +# ifndef __ASSUME_POSIX_TIMERS + cmpq $-ENOSYS, %rax + je 19f +# endif + + /* Compute relative timeout. */ + movq (%r13), %rcx + movq 8(%r13), %rdx + subq 32(%rsp), %rcx + subq 40(%rsp), %rdx +# else + leaq 24(%rsp), %rdi + xorl %esi, %esi + movq $VSYSCALL_ADDR_vgettimeofday, %rax + callq *%rax + + /* Compute relative timeout. */ + movq 40(%rsp), %rax + movl $1000, %edx + mul %rdx /* Milli seconds to nano seconds. */ + movq (%r13), %rcx + movq 8(%r13), %rdx + subq 32(%rsp), %rcx + subq %rax, %rdx +# endif + jns 12f + addq $1000000000, %rdx + decq %rcx +12: testq %rcx, %rcx + movq 8(%rsp), %rdi + movq $-ETIMEDOUT, %r14 + js 6f + + /* Store relative timeout. */ +21: movq %rcx, 32(%rsp) + movq %rdx, 40(%rsp) + + movl cond_futex(%rdi), %r12d + + /* Unlock. 
*/ + LOCK +# if cond_lock == 0 + decl (%rdi) +# else + decl cond_lock(%rdi) +# endif + jne 3f + +.LcleanupSTART2: +4: callq __pthread_enable_asynccancel + movl %eax, (%rsp) + movq 8(%rsp), %rdi + + leaq 32(%rsp), %r10 + cmpq $-1, dep_mutex(%rdi) + movq %r12, %rdx +# ifdef __ASSUME_PRIVATE_FUTEX + movl $FUTEX_WAIT, %eax + movl $(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi + cmove %eax, %esi +# else + movl $0, %eax + movl %fs:PRIVATE_FUTEX, %esi + cmove %eax, %esi +# if FUTEX_WAIT != 0 + orl $FUTEX_WAIT, %esi +# endif +# endif + addq $cond_futex, %rdi + movl $SYS_futex, %eax + syscall + movq %rax, %r14 + + movl (%rsp), %edi + callq __pthread_disable_asynccancel +.LcleanupEND2: + + /* Lock. */ + movq 8(%rsp), %rdi + movl $1, %esi + xorl %eax, %eax + LOCK +# if cond_lock == 0 + cmpxchgl %esi, (%rdi) +# else + cmpxchgl %esi, cond_lock(%rdi) +# endif + jne 5f + +6: movl broadcast_seq(%rdi), %edx + + movq woken_seq(%rdi), %rax + + movq wakeup_seq(%rdi), %r9 + + cmpl 4(%rsp), %edx + jne 53b + + cmpq 24(%rsp), %r9 + jbe 45b + + cmpq %rax, %r9 + ja 39b + + cmpq $-ETIMEDOUT, %r14 + jne 8b + + jmp 99b + + /* Initial locking failed. */ +1: +# if cond_lock != 0 + addq $cond_lock, %rdi +# endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_lock_wait + jmp 2b + + /* Unlock in loop requires wakeup. */ +3: +# if cond_lock != 0 + addq $cond_lock, %rdi +# endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_unlock_wake + jmp 4b + + /* Locking in loop failed. */ +5: +# if cond_lock != 0 + addq $cond_lock, %rdi +# endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_lock_wait +# if cond_lock != 0 + subq $cond_lock, %rdi +# endif + jmp 6b -#if defined __NR_clock_gettime && !defined __ASSUME_POSIX_TIMERS +# if defined __NR_clock_gettime && !defined __ASSUME_POSIX_TIMERS /* clock_gettime not available. */ -19: leaq 24(%rsp), %rdi +19: leaq 32(%rsp), %rdi xorl %esi, %esi movq $VSYSCALL_ADDR_vgettimeofday, %rax callq *%rax /* Compute relative timeout. */ - movq 32(%rsp), %rax + movq 40(%rsp), %rax movl $1000, %edx mul %rdx /* Milli seconds to nano seconds. */ movq (%r13), %rcx movq 8(%r13), %rdx - subq 24(%rsp), %rcx + subq 32(%rsp), %rcx subq %rax, %rdx jns 20f addq $1000000000, %rdx @@ -422,8 +625,188 @@ __pthread_cond_timedwait: movq $-ETIMEDOUT, %r14 js 6b jmp 21b +# endif #endif - cfi_endproc .size __pthread_cond_timedwait, .-__pthread_cond_timedwait versioned_symbol (libpthread, __pthread_cond_timedwait, pthread_cond_timedwait, GLIBC_2_3_2) + + + .align 16 + .type __condvar_cleanup2, @function +__condvar_cleanup2: + /* Stack frame: + + rsp + 72 + +--------------------------+ + rsp + 64 | %r12 | + +--------------------------+ + rsp + 56 | %r13 | + +--------------------------+ + rsp + 48 | %r14 | + +--------------------------+ + rsp + 24 | unused | + +--------------------------+ + rsp + 16 | mutex pointer | + +--------------------------+ + rsp + 8 | condvar pointer | + +--------------------------+ + rsp + 4 | old broadcast_seq value | + +--------------------------+ + rsp + 0 | old cancellation mode | + +--------------------------+ + */ + + movq %rax, 24(%rsp) + + /* Get internal lock. 
*/ + movq 8(%rsp), %rdi + movl $1, %esi + xorl %eax, %eax + LOCK +#if cond_lock == 0 + cmpxchgl %esi, (%rdi) +#else + cmpxchgl %esi, cond_lock(%rdi) +#endif + jz 1f + +#if cond_lock != 0 + addq $cond_lock, %rdi +#endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_lock_wait +#if cond_lock != 0 + subq $cond_lock, %rdi +#endif + +1: movl broadcast_seq(%rdi), %edx + cmpl 4(%rsp), %edx + jne 3f + + /* We increment the wakeup_seq counter only if it is lower than + total_seq. If this is not the case the thread was woken and + then canceled. In this case we ignore the signal. */ + movq total_seq(%rdi), %rax + cmpq wakeup_seq(%rdi), %rax + jbe 6f + incq wakeup_seq(%rdi) + incl cond_futex(%rdi) +6: incq woken_seq(%rdi) + +3: subl $(1 << nwaiters_shift), cond_nwaiters(%rdi) + + /* Wake up a thread which wants to destroy the condvar object. */ + xorq %r12, %r12 + cmpq $0xffffffffffffffff, total_seq(%rdi) + jne 4f + movl cond_nwaiters(%rdi), %eax + andl $~((1 << nwaiters_shift) - 1), %eax + jne 4f + + cmpq $-1, dep_mutex(%rdi) + leaq cond_nwaiters(%rdi), %rdi + movl $1, %edx +#ifdef __ASSUME_PRIVATE_FUTEX + movl $FUTEX_WAKE, %eax + movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi + cmove %eax, %esi +#else + movl $0, %eax + movl %fs:PRIVATE_FUTEX, %esi + cmove %eax, %esi + orl $FUTEX_WAKE, %esi +#endif + movl $SYS_futex, %eax + syscall + subq $cond_nwaiters, %rdi + movl $1, %r12d + +4: LOCK +#if cond_lock == 0 + decl (%rdi) +#else + decl cond_lock(%rdi) +#endif + je 2f +#if cond_lock != 0 + addq $cond_lock, %rdi +#endif + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_unlock_wake + + /* Wake up all waiters to make sure no signal gets lost. 
*/ +2: testq %r12, %r12 + jnz 5f + addq $cond_futex, %rdi + cmpq $-1, dep_mutex-cond_futex(%rdi) + movl $0x7fffffff, %edx +#ifdef __ASSUME_PRIVATE_FUTEX + movl $FUTEX_WAKE, %eax + movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi + cmove %eax, %esi +#else + movl $0, %eax + movl %fs:PRIVATE_FUTEX, %esi + cmove %eax, %esi + orl $FUTEX_WAKE, %esi +#endif + movl $SYS_futex, %eax + syscall + +5: movq 16(%rsp), %rdi + callq __pthread_mutex_cond_lock + + movq 24(%rsp), %rdi + movq FRAME_SIZE(%rsp), %r15 + movq FRAME_SIZE+8(%rsp), %r14 + movq FRAME_SIZE+16(%rsp), %r13 + movq FRAME_SIZE+24(%rsp), %r12 +.LcallUR: + call _Unwind_Resume@PLT + hlt +.LENDCODE: + cfi_endproc + .size __condvar_cleanup2, .-__condvar_cleanup2 + + + .section .gcc_except_table,"a",@progbits +.LexceptSTART: + .byte DW_EH_PE_omit # @LPStart format + .byte DW_EH_PE_omit # @TType format + .byte DW_EH_PE_uleb128 # call-site format + .uleb128 .Lcstend-.Lcstbegin +.Lcstbegin: + .uleb128 .LcleanupSTART1-.LSTARTCODE + .uleb128 .LcleanupEND1-.LcleanupSTART1 + .uleb128 __condvar_cleanup2-.LSTARTCODE + .uleb128 0 +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME + .uleb128 .LcleanupSTART2-.LSTARTCODE + .uleb128 .LcleanupEND2-.LcleanupSTART2 + .uleb128 __condvar_cleanup2-.LSTARTCODE + .uleb128 0 +#endif + .uleb128 .LcallUR-.LSTARTCODE + .uleb128 .LENDCODE-.LcallUR + .uleb128 0 + .uleb128 0 +.Lcstend: + + +#ifdef SHARED + .hidden DW.ref.__gcc_personality_v0 + .weak DW.ref.__gcc_personality_v0 + .section .gnu.linkonce.d.DW.ref.__gcc_personality_v0,"aw",@progbits + .align 8 + .type DW.ref.__gcc_personality_v0, @object + .size DW.ref.__gcc_personality_v0, 8 +DW.ref.__gcc_personality_v0: + .quad __gcc_personality_v0 +#endif diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S index e5e802d531..e6323ea3e2 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002,2003,2004,2005,2006,2007 Free Software Foundation, Inc. +/* Copyright (C) 2002-2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. @@ -22,146 +22,42 @@ #include <lowlevellock.h> #include <lowlevelcond.h> #include <tcb-offsets.h> +#include <pthread-pi-defines.h> #include <kernel-features.h> .text - .align 16 - .type __condvar_cleanup, @function - .globl __condvar_cleanup - .hidden __condvar_cleanup -__condvar_cleanup: - pushq %r12 - - /* Get internal lock. */ - movq %rdi, %r8 - movq 8(%rdi), %rdi - movl $1, %esi - xorl %eax, %eax - LOCK -#if cond_lock == 0 - cmpxchgl %esi, (%rdi) -#else - cmpxchgl %esi, cond_lock(%rdi) -#endif - jz 1f - -#if cond_lock != 0 - addq $cond_lock, %rdi -#endif - cmpq $-1, dep_mutex-cond_lock(%rdi) - movl $LLL_PRIVATE, %eax - movl $LLL_SHARED, %esi - cmovne %eax, %esi - callq __lll_lock_wait -#if cond_lock != 0 - subq $cond_lock, %rdi -#endif - -1: movl broadcast_seq(%rdi), %edx - cmpl 4(%r8), %edx - jne 3f - - /* We increment the wakeup_seq counter only if it is lower than - total_seq. If this is not the case the thread was woken and - then canceled. In this case we ignore the signal. */ - movq total_seq(%rdi), %rax - cmpq wakeup_seq(%rdi), %rax - jbe 6f - incq wakeup_seq(%rdi) - incl cond_futex(%rdi) -6: incq woken_seq(%rdi) - -3: subl $(1 << nwaiters_shift), cond_nwaiters(%rdi) - - /* Wake up a thread which wants to destroy the condvar object. 
*/ - xorq %r12, %r12 - cmpq $0xffffffffffffffff, total_seq(%rdi) - jne 4f - movl cond_nwaiters(%rdi), %eax - andl $~((1 << nwaiters_shift) - 1), %eax - jne 4f - - addq $cond_nwaiters, %rdi - cmpq $-1, dep_mutex-cond_nwaiters(%rdi) - movl $1, %edx -#ifdef __ASSUME_PRIVATE_FUTEX - movl $FUTEX_WAKE, %eax - movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi - cmove %eax, %esi -#else - movl $0, %eax - movl %fs:PRIVATE_FUTEX, %esi - cmove %eax, %esi - orl $FUTEX_WAKE, %esi -#endif - movl $SYS_futex, %eax - syscall - subq $cond_nwaiters, %rdi - movl $1, %r12d - -4: LOCK -#if cond_lock == 0 - decl (%rdi) -#else - decl cond_lock(%rdi) -#endif - je 2f -#if cond_lock != 0 - addq $cond_lock, %rdi -#endif - cmpq $-1, dep_mutex-cond_lock(%rdi) - movl $LLL_PRIVATE, %eax - movl $LLL_SHARED, %esi - cmovne %eax, %esi - callq __lll_unlock_wake - - /* Wake up all waiters to make sure no signal gets lost. */ -2: testq %r12, %r12 - jnz 5f - addq $cond_futex, %rdi - cmpq $-1, dep_mutex-cond_futex(%rdi) - movl $0x7fffffff, %edx -#ifdef __ASSUME_PRIVATE_FUTEX - movl $FUTEX_WAKE, %eax - movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi - cmove %eax, %esi -#else - movl $0, %eax - movl %fs:PRIVATE_FUTEX, %esi - cmove %eax, %esi - orl $FUTEX_WAKE, %esi -#endif - movl $SYS_futex, %eax - syscall - -5: movq 16(%r8), %rdi - callq __pthread_mutex_cond_lock - - popq %r12 - - retq - .size __condvar_cleanup, .-__condvar_cleanup - - /* int pthread_cond_wait (pthread_cond_t *cond, pthread_mutex_t *mutex) */ .globl __pthread_cond_wait .type __pthread_cond_wait, @function .align 16 __pthread_cond_wait: .LSTARTCODE: + cfi_startproc +#ifdef SHARED + cfi_personality(DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect, + DW.ref.__gcc_personality_v0) + cfi_lsda(DW_EH_PE_pcrel | DW_EH_PE_sdata4, .LexceptSTART) +#else + cfi_personality(DW_EH_PE_udata4, __gcc_personality_v0) + cfi_lsda(DW_EH_PE_udata4, .LexceptSTART) +#endif + pushq %r12 -.Lpush_r12: -#define FRAME_SIZE 64 + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r12, 0) + pushq %r13 + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r13, 0) +#define FRAME_SIZE 32 subq $FRAME_SIZE, %rsp -.Lsubq: + cfi_adjust_cfa_offset(FRAME_SIZE) + /* Stack frame: - rsp + 64 - +--------------------------+ - rsp + 32 | cleanup buffer | + rsp + 32 +--------------------------+ rsp + 24 | old wake_seq value | +--------------------------+ @@ -208,16 +104,6 @@ __pthread_cond_wait: incl cond_futex(%rdi) addl $(1 << nwaiters_shift), cond_nwaiters(%rdi) - /* Install cancellation handler. */ -#ifdef PIC - leaq __condvar_cleanup(%rip), %rsi -#else - leaq __condvar_cleanup, %rsi -#endif - leaq 32(%rsp), %rdi - movq %rsp, %rdx - callq __pthread_cleanup_push - /* Get and store current wakeup_seq value. */ movq 8(%rsp), %rdi movq wakeup_seq(%rdi), %r9 @@ -235,31 +121,57 @@ __pthread_cond_wait: #endif jne 3f +.LcleanupSTART: 4: callq __pthread_enable_asynccancel movl %eax, (%rsp) movq 8(%rsp), %rdi xorq %r10, %r10 movq %r12, %rdx - addq $cond_futex-cond_lock, %rdi + // XXX reverse + lea + addq $cond_futex, %rdi cmpq $-1, dep_mutex-cond_futex(%rdi) #ifdef __ASSUME_PRIVATE_FUTEX movl $FUTEX_WAIT, %eax movl $(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi cmove %eax, %esi #else - movl $FUTEX_WAIT, %eax + movl $0, %eax movl %fs:PRIVATE_FUTEX, %esi cmove %eax, %esi # if FUTEX_WAIT != 0 +# error "cc destroyed by following orl" orl $FUTEX_WAIT, %esi # endif #endif + je 60f + + movq dep_mutex-cond_futex(%rdi), %r8 + /* Requeue to a PI mutex if the PI bit is set. 
*/ + testl $PI_BIT, MUTEX_KIND(%r8) + je 60f + + movl $(FUTEX_WAIT_REQUEUE_PI|FUTEX_PRIVATE_FLAG), %esi + movl $SYS_futex, %eax + syscall + + movl $1, %r13d +#ifdef __ASSUME_REQUEUE_PI + jmp 62f +#else + cmpq $-4095, %rax + jnae 62f + + movl $(FUTEX_WAIT|FUTEX_PRIVATE_FLAG), %esi +#endif + +60: xorl %r13d, %r13d movl $SYS_futex, %eax syscall - movl (%rsp), %edi +62: movl (%rsp), %edi callq __pthread_disable_asynccancel +.LcleanupEND: /* Lock. */ movq 8(%rsp), %rdi @@ -325,24 +237,33 @@ __pthread_cond_wait: #endif jne 10f - /* Remove cancellation handler. */ -11: movq 32+CLEANUP_PREV(%rsp), %rdx - movq %rdx, %fs:CLEANUP + /* If requeue_pi is used the kernel performs the locking of the + mutex. */ +11: xorl %eax, %eax + testl %r13d, %r13d + jnz 14f movq 16(%rsp), %rdi callq __pthread_mutex_cond_lock + 14: addq $FRAME_SIZE, %rsp -.Laddq: + cfi_adjust_cfa_offset(-FRAME_SIZE) + popq %r13 + cfi_adjust_cfa_offset(-8) + cfi_restore(%r13) popq %r12 -.Lpop_r12: + cfi_adjust_cfa_offset(-8) + cfi_restore(%r12) /* We return the result of the mutex_lock operation. */ retq /* Initial locking failed. */ 1: -.LSbl1: + cfi_adjust_cfa_offset(16 + FRAME_SIZE) + cfi_rel_offset(%r12, FRAME_SIZE + 8) + cfi_rel_offset(%r13, FRAME_SIZE) #if cond_lock != 0 addq $cond_lock, %rdi #endif @@ -414,75 +335,178 @@ __pthread_cond_wait: 13: movq %r10, %rax jmp 14b -.LENDCODE: .size __pthread_cond_wait, .-__pthread_cond_wait versioned_symbol (libpthread, __pthread_cond_wait, pthread_cond_wait, GLIBC_2_3_2) - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long L(ENDCIE)-L(STARTCIE) # Length of the CIE. -.LSTARTCIE: - .long 0 # CIE ID. - .byte 1 # Version number. -#ifdef SHARED - .string "zR" # NUL-terminated augmentation - # string. + .align 16 + .type __condvar_cleanup1, @function + .globl __condvar_cleanup1 + .hidden __condvar_cleanup1 +__condvar_cleanup1: + /* Stack frame: + + rsp + 48 + +--------------------------+ + rsp + 40 | %r12 | + +--------------------------+ + rsp + 32 | %r13 | + +--------------------------+ + rsp + 24 | unused | + +--------------------------+ + rsp + 16 | mutex pointer | + +--------------------------+ + rsp + 8 | condvar pointer | + +--------------------------+ + rsp + 4 | old broadcast_seq value | + +--------------------------+ + rsp + 0 | old cancellation mode | + +--------------------------+ + */ + + movq %rax, 24(%rsp) + + /* Get internal lock. */ + movq 8(%rsp), %rdi + movl $1, %esi + xorl %eax, %eax + LOCK +#if cond_lock == 0 + cmpxchgl %esi, (%rdi) #else - .ascii "\0" # NUL-terminated augmentation - # string. + cmpxchgl %esi, cond_lock(%rdi) #endif - .uleb128 1 # Code alignment factor. - .sleb128 -8 # Data alignment factor. - .byte 16 # Return address register - # column. -#ifdef SHARED - .uleb128 1 # Augmentation value length. - .byte 0x1b # Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. + jz 1f + +#if cond_lock != 0 + addq $cond_lock, %rdi #endif - .byte 0x0c # DW_CFA_def_cfa - .uleb128 7 - .uleb128 8 - .byte 0x90 # DW_CFA_offset, column 0x8 - .uleb128 1 - .align 8 -.LENDCIE: - - .long .LENDFDE-.LSTARTFDE # Length of the FDE. -.LSTARTFDE: - .long .LSTARTFDE-.LSTARTFRAME # CIE pointer. -#ifdef SHARED - .long .LSTARTCODE-. 
# PC-relative start address - # of the code + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_lock_wait +#if cond_lock != 0 + subq $cond_lock, %rdi +#endif + +1: movl broadcast_seq(%rdi), %edx + cmpl 4(%rsp), %edx + jne 3f + + /* We increment the wakeup_seq counter only if it is lower than + total_seq. If this is not the case the thread was woken and + then canceled. In this case we ignore the signal. */ + movq total_seq(%rdi), %rax + cmpq wakeup_seq(%rdi), %rax + jbe 6f + incq wakeup_seq(%rdi) + incl cond_futex(%rdi) +6: incq woken_seq(%rdi) + +3: subl $(1 << nwaiters_shift), cond_nwaiters(%rdi) + + /* Wake up a thread which wants to destroy the condvar object. */ + xorq %r12, %r12 + cmpq $0xffffffffffffffff, total_seq(%rdi) + jne 4f + movl cond_nwaiters(%rdi), %eax + andl $~((1 << nwaiters_shift) - 1), %eax + jne 4f + + cmpq $-1, dep_mutex(%rdi) + leaq cond_nwaiters(%rdi), %rdi + movl $1, %edx +#ifdef __ASSUME_PRIVATE_FUTEX + movl $FUTEX_WAKE, %eax + movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi + cmove %eax, %esi +#else + movl $0, %eax + movl %fs:PRIVATE_FUTEX, %esi + cmove %eax, %esi + orl $FUTEX_WAKE, %esi +#endif + movl $SYS_futex, %eax + syscall + subq $cond_nwaiters, %rdi + movl $1, %r12d + +4: LOCK +#if cond_lock == 0 + decl (%rdi) #else - .long .LSTARTCODE # Start address of the code. + decl cond_lock(%rdi) #endif - .long .LENDCODE-.LSTARTCODE # Length of the code. -#ifdef SHARED - .uleb128 0 # No augmentation data. + je 2f +#if cond_lock != 0 + addq $cond_lock, %rdi #endif - .byte 0x40+.Lpush_r12-.LSTARTCODE # DW_CFA_advance_loc+N - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 0x8c # DW_CFA_offset %r12 - .uleb128 2 - .byte 0x40+.Lsubq-.Lpush_r12 # DW_CFA_advance_loc+N - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16+FRAME_SIZE - .byte 3 # DW_CFA_advance_loc2 - .2byte .Laddq-.Lsubq - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 0x40+.Lpop_r12-.Laddq # DW_CFA_advance_loc+N - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 8 - .byte 0xcc # DW_CFA_restore %r12 - .byte 0x40+.LSbl1-.Lpop_r12 # DW_CFA_advance_loc+N - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 80 - .byte 0x8c # DW_CFA_offset %r12 - .uleb128 2 + cmpq $-1, dep_mutex-cond_lock(%rdi) + movl $LLL_PRIVATE, %eax + movl $LLL_SHARED, %esi + cmovne %eax, %esi + callq __lll_unlock_wake + + /* Wake up all waiters to make sure no signal gets lost. 
*/ +2: testq %r12, %r12 + jnz 5f + addq $cond_futex, %rdi + cmpq $-1, dep_mutex-cond_futex(%rdi) + movl $0x7fffffff, %edx +#ifdef __ASSUME_PRIVATE_FUTEX + movl $FUTEX_WAKE, %eax + movl $(FUTEX_WAKE|FUTEX_PRIVATE_FLAG), %esi + cmove %eax, %esi +#else + movl $0, %eax + movl %fs:PRIVATE_FUTEX, %esi + cmove %eax, %esi + orl $FUTEX_WAKE, %esi +#endif + movl $SYS_futex, %eax + syscall + +5: movq 16(%rsp), %rdi + callq __pthread_mutex_cond_lock + + movq 24(%rsp), %rdi + movq 40(%rsp), %r12 + movq 32(%rsp), %r13 +.LcallUR: + call _Unwind_Resume@PLT + hlt +.LENDCODE: + cfi_endproc + .size __condvar_cleanup1, .-__condvar_cleanup1 + + + .section .gcc_except_table,"a",@progbits +.LexceptSTART: + .byte DW_EH_PE_omit # @LPStart format + .byte DW_EH_PE_omit # @TType format + .byte DW_EH_PE_uleb128 # call-site format + .uleb128 .Lcstend-.Lcstbegin +.Lcstbegin: + .uleb128 .LcleanupSTART-.LSTARTCODE + .uleb128 .LcleanupEND-.LcleanupSTART + .uleb128 __condvar_cleanup1-.LSTARTCODE + .uleb128 0 + .uleb128 .LcallUR-.LSTARTCODE + .uleb128 .LENDCODE-.LcallUR + .uleb128 0 + .uleb128 0 +.Lcstend: + + +#ifdef SHARED + .hidden DW.ref.__gcc_personality_v0 + .weak DW.ref.__gcc_personality_v0 + .section .gnu.linkonce.d.DW.ref.__gcc_personality_v0,"aw",@progbits .align 8 -.LENDFDE: + .type DW.ref.__gcc_personality_v0, @object + .size DW.ref.__gcc_personality_v0, 8 +DW.ref.__gcc_personality_v0: + .quad __gcc_personality_v0 +#endif diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S index c3b2b51bdb..ccc18493a2 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_once.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2005, 2007 Free Software Foundation, Inc. +/* Copyright (C) 2002, 2003, 2005, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. @@ -17,6 +17,7 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ +#include <sysdep.h> #include <kernel-features.h> #include <tcb-offsets.h> #include <lowlevellock.h> @@ -32,6 +33,15 @@ .align 16 __pthread_once: .LSTARTCODE: + cfi_startproc +#ifdef SHARED + cfi_personality(DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect, + DW.ref.__gcc_personality_v0) + cfi_lsda(DW_EH_PE_pcrel | DW_EH_PE_sdata4, .LexceptSTART) +#else + cfi_personality(DW_EH_PE_udata4, __gcc_personality_v0) + cfi_lsda(DW_EH_PE_udata4, .LexceptSTART) +#endif testl $2, (%rdi) jz 1f xorl %eax, %eax @@ -39,7 +49,7 @@ __pthread_once: /* Preserve the function pointer. */ 1: pushq %rsi -.Lpush_rsi: + cfi_adjust_cfa_offset(8) xorq %r10, %r10 /* Not yet initialized or initialization in progress. @@ -86,9 +96,9 @@ __pthread_once: /* Preserve the pointer to the control variable. */ 3: pushq %rdi -.Lpush_rdi: + cfi_adjust_cfa_offset(8) pushq %rdi -.Lpush_rdi2: + cfi_adjust_cfa_offset(8) .LcleanupSTART: callq *16(%rsp) @@ -96,14 +106,14 @@ __pthread_once: /* Get the control variable address back. */ popq %rdi -.Lpop_rdi: + cfi_adjust_cfa_offset(-8) /* Successful run of the initializer. Signal that we are done. */ LOCK incl (%rdi) addq $8, %rsp -.Ladd1: + cfi_adjust_cfa_offset(-8) /* Wake up all other threads. 
*/ movl $0x7fffffff, %edx @@ -117,10 +127,9 @@ __pthread_once: syscall 4: addq $8, %rsp -.Ladd2: + cfi_adjust_cfa_offset(-8) xorl %eax, %eax retq - .size __pthread_once,.-__pthread_once @@ -134,6 +143,7 @@ pthread_once = __pthread_once .type clear_once_control,@function .align 16 clear_once_control: + cfi_adjust_cfa_offset(3 * 8) movq (%rsp), %rdi movq %rax, %r8 movl $0, (%rdi) @@ -153,15 +163,15 @@ clear_once_control: call _Unwind_Resume@PLT hlt .LENDCODE: + cfi_endproc .size clear_once_control,.-clear_once_control .section .gcc_except_table,"a",@progbits .LexceptSTART: - .byte 0xff # @LPStart format (omit) - .byte 0xff # @TType format (omit) - .byte 0x01 # call-site format - # DW_EH_PE_uleb128 + .byte DW_EH_PE_omit # @LPStart format + .byte DW_EH_PE_omit # @TType format + .byte DW_EH_PE_uleb128 # call-site format .uleb128 .Lcstend-.Lcstbegin .Lcstbegin: .uleb128 .LcleanupSTART-.LSTARTCODE @@ -175,100 +185,6 @@ clear_once_control: .Lcstend: - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE # Length of the CIE. -.LSTARTCIE: - .long 0 # CIE ID. - .byte 1 # Version number. -#ifdef SHARED - .string "zPLR" # NUL-terminated augmentation - # string. -#else - .string "zPL" # NUL-terminated augmentation - # string. -#endif - .uleb128 1 # Code alignment factor. - .sleb128 -8 # Data alignment factor. - .byte 16 # Return address register - # column. -#ifdef SHARED - .uleb128 7 # Augmentation value length. - .byte 0x9b # Personality: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4 - # + DW_EH_PE_indirect - .long DW.ref.__gcc_personality_v0-. - .byte 0x1b # LSDA Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. - .byte 0x1b # FDE Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. -#else - .uleb128 10 # Augmentation value length. - .byte 0x0 # Personality: absolute - .quad __gcc_personality_v0 - .byte 0x0 # LSDA Encoding: absolute -#endif - .byte 0x0c # DW_CFA_def_cfa - .uleb128 7 - .uleb128 8 - .byte 0x90 # DW_CFA_offset, column 0x10 - .uleb128 1 - .align 8 -.LENDCIE: - - .long .LENDFDE-.LSTARTFDE # Length of the FDE. -.LSTARTFDE: - .long .LSTARTFDE-.LSTARTFRAME # CIE pointer. -#ifdef SHARED - .long .LSTARTCODE-. # PC-relative start address - # of the code. - .long .LENDCODE-.LSTARTCODE # Length of the code. - .uleb128 4 # Augmentation size - .long .LexceptSTART-. -#else - .quad .LSTARTCODE # Start address of the code. - .quad .LENDCODE-.LSTARTCODE # Length of the code. 
- .uleb128 8 # Augmentation size - .quad .LexceptSTART -#endif - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_rsi-.LSTARTCODE - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_rdi-.Lpush_rsi - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 24 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_rdi2-.Lpush_rdi - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 32 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_rdi-.Lpush_rdi2 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 24 - .byte 4 # DW_CFA_advance_loc4 - .long .Ladd1-.Lpop_rdi - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 4 # DW_CFA_advance_loc4 - .long .Ladd2-.Ladd1 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 8 - .byte 4 # DW_CFA_advance_loc4 - .long clear_once_control-.Ladd2 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 32 -#if 0 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_rdi3-clear_once_control - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 -#endif - .align 8 -.LENDFDE: - - #ifdef SHARED .hidden DW.ref.__gcc_personality_v0 .weak DW.ref.__gcc_personality_v0 diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S index 366c96fc36..23b218af34 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedrdlock.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2004, 2005, 2007 Free Software Foundation, Inc. +/* Copyright (C) 2002-2005, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. @@ -36,16 +36,21 @@ pthread_rwlock_timedrdlock: cfi_startproc pushq %r12 cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r12, 0) pushq %r13 cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r13, 0) +#ifdef __ASSUME_FUTEX_CLOCK_REALTIME +# define VALREG %edx +#else pushq %r14 cfi_adjust_cfa_offset(8) - cfi_offset(%r12, -16) - cfi_offset(%r13, -24) - cfi_offset(%r14, -32) + cfi_rel_offset(%r14, 0) subq $16, %rsp cfi_adjust_cfa_offset(16) +# define VALREG %r14d +#endif movq %rdi, %r12 movq %rsi, %r13 @@ -76,7 +81,7 @@ pthread_rwlock_timedrdlock: incl READERS_QUEUED(%r12) je 4f - movl READERS_WAKEUP(%r12), %r14d + movl READERS_WAKEUP(%r12), VALREG /* Unlock. */ LOCK @@ -87,8 +92,33 @@ pthread_rwlock_timedrdlock: #endif jne 10f +11: +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME +# ifdef PIC + cmpl $0, __have_futex_clock_realtime(%rip) +# else + cmpl $0, __have_futex_clock_realtime +# endif + je .Lreltmo +#endif + + movl $FUTEX_PRIVATE_FLAG|FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, %esi + xorl PSHARED(%r12), %esi + movq %r13, %r10 + movl $0xffffffff, %r9d +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME + movl %r14d, %edx +#endif +21: leaq READERS_WAKEUP(%r12), %rdi + movl $SYS_futex, %eax + syscall + movq %rax, %rdx + +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME + .subsection 2 +.Lreltmo: /* Get current time. */ -11: movq %rsp, %rdi + movq %rsp, %rdi xorl %esi, %esi movq $VSYSCALL_ADDR_vgettimeofday, %rax callq *%rax @@ -111,27 +141,26 @@ pthread_rwlock_timedrdlock: movq %rcx, (%rsp) /* Store relative timeout. 
*/
 	movq	%rdi, 8(%rsp)

-#ifdef __ASSUME_PRIVATE_FUTEX
+# ifdef __ASSUME_PRIVATE_FUTEX
 	movl	$FUTEX_PRIVATE_FLAG|FUTEX_WAIT, %esi
 	xorl	PSHARED(%r12), %esi
-#else
-# if FUTEX_WAIT == 0
-	movl	PSHARED(%r12), %esi
 # else
+#  if FUTEX_WAIT == 0
+	movl	PSHARED(%r12), %esi
+#  else
 	movl	$FUTEX_WAIT, %esi
 	orl	PSHARED(%r12), %esi
-# endif
+#  endif
 	xorl	%fs:PRIVATE_FUTEX, %esi
-#endif
+# endif
 	movq	%rsp, %r10
 	movl	%r14d, %edx
-	leaq	READERS_WAKEUP(%r12), %rdi
-	movl	$SYS_futex, %eax
-	syscall
-	movq	%rax, %rdx
-17:
-	/* Reget the lock. */
+	jmp	21b
+	.previous
+#endif
+
+17:	/* Reget the lock. */
 	movl	$1, %esi
 	xorl	%eax, %eax
 	LOCK
@@ -163,11 +192,13 @@ pthread_rwlock_timedrdlock:

 7:	movq	%rdx, %rax
+#ifndef __ASSUME_FUTEX_CLOCK_REALTIME
 	addq	$16, %rsp
 	cfi_adjust_cfa_offset(-16)
 	popq	%r14
 	cfi_adjust_cfa_offset(-8)
 	cfi_restore(%r14)
+#endif
 	popq	%r13
 	cfi_adjust_cfa_offset(-8)
 	cfi_restore(%r13)
@@ -176,10 +207,16 @@ pthread_rwlock_timedrdlock:
 	cfi_restore(%r12)
 	retq

+#ifdef __ASSUME_FUTEX_CLOCK_REALTIME
+	cfi_adjust_cfa_offset(16)
+	cfi_rel_offset(%r12, 8)
+	cfi_rel_offset(%r13, 0)
+#else
 	cfi_adjust_cfa_offset(40)
 	cfi_offset(%r12, -16)
 	cfi_offset(%r13, -24)
 	cfi_offset(%r14, -32)
+#endif
 1:	movl	PSHARED(%rdi), %esi
 #if MUTEX != 0
 	addq	$MUTEX, %rdi
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S
index dde6b58836..cd867b60dc 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/pthread_rwlock_timedwrlock.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002, 2003, 2005, 2007 Free Software Foundation, Inc.
+/* Copyright (C) 2002, 2003, 2005, 2007, 2009 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
@@ -36,16 +36,21 @@ pthread_rwlock_timedwrlock:
 	cfi_startproc
 	pushq	%r12
 	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%r12, 0)
 	pushq	%r13
 	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%r13, 0)
+#ifdef __ASSUME_FUTEX_CLOCK_REALTIME
+# define VALREG	%edx
+#else
 	pushq	%r14
 	cfi_adjust_cfa_offset(8)
-	cfi_offset(%r12, -16)
-	cfi_offset(%r13, -24)
-	cfi_offset(%r14, -32)
+	cfi_rel_offset(%r14, 0)

 	subq	$16, %rsp
 	cfi_adjust_cfa_offset(16)
+# define VALREG	%r14d
+#endif

 	movq	%rdi, %r12
 	movq	%rsi, %r13
@@ -74,7 +79,7 @@ pthread_rwlock_timedwrlock:
 	incl	WRITERS_QUEUED(%r12)
 	je	4f

-	movl	WRITERS_WAKEUP(%r12), %r14d
+	movl	WRITERS_WAKEUP(%r12), VALREG

 	LOCK
 #if MUTEX == 0
@@ -84,8 +89,33 @@ pthread_rwlock_timedwrlock:
 #endif
 	jne	10f

+11:
+#ifndef __ASSUME_FUTEX_CLOCK_REALTIME
+# ifdef PIC
+	cmpl	$0, __have_futex_clock_realtime(%rip)
+# else
+	cmpl	$0, __have_futex_clock_realtime
+# endif
+	je	.Lreltmo
+#endif
+
+	movl	$FUTEX_PRIVATE_FLAG|FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, %esi
+	xorl	PSHARED(%r12), %esi
+	movq	%r13, %r10
+	movl	$0xffffffff, %r9d
+#ifndef __ASSUME_FUTEX_CLOCK_REALTIME
+	movl	%r14d, %edx
+#endif
+21:	leaq	WRITERS_WAKEUP(%r12), %rdi
+	movl	$SYS_futex, %eax
+	syscall
+	movq	%rax, %rdx
+
+#ifndef __ASSUME_FUTEX_CLOCK_REALTIME
+	.subsection 2
+.Lreltmo:
 	/* Get current time.  */
-11:	movq	%rsp, %rdi
+	movq	%rsp, %rdi
 	xorl	%esi, %esi
 	movq	$VSYSCALL_ADDR_vgettimeofday, %rax
 	callq	*%rax
@@ -108,27 +138,26 @@ pthread_rwlock_timedwrlock:
 	movq	%rcx, (%rsp)	/* Store relative timeout.
*/
 	movq	%rdi, 8(%rsp)

-#ifdef __ASSUME_PRIVATE_FUTEX
+# ifdef __ASSUME_PRIVATE_FUTEX
 	movl	$FUTEX_PRIVATE_FLAG|FUTEX_WAIT, %esi
 	xorl	PSHARED(%r12), %esi
-#else
-# if FUTEX_WAIT == 0
-	movl	PSHARED(%r12), %esi
 # else
+#  if FUTEX_WAIT == 0
+	movl	PSHARED(%r12), %esi
+#  else
 	movl	$FUTEX_WAIT, %esi
 	orl	PSHARED(%r12), %esi
-# endif
+#  endif
 	xorl	%fs:PRIVATE_FUTEX, %esi
-#endif
+# endif
 	movq	%rsp, %r10
 	movl	%r14d, %edx
-	leaq	WRITERS_WAKEUP(%r12), %rdi
-	movl	$SYS_futex, %eax
-	syscall
-	movq	%rax, %rdx
-17:
-	/* Reget the lock. */
+	jmp	21b
+	.previous
+#endif
+
+17:	/* Reget the lock. */
 	movl	$1, %esi
 	xorl	%eax, %eax
 	LOCK
@@ -160,11 +189,13 @@ pthread_rwlock_timedwrlock:

 7:	movq	%rdx, %rax
+#ifndef __ASSUME_FUTEX_CLOCK_REALTIME
 	addq	$16, %rsp
 	cfi_adjust_cfa_offset(-16)
 	popq	%r14
 	cfi_adjust_cfa_offset(-8)
 	cfi_restore(%r14)
+#endif
 	popq	%r13
 	cfi_adjust_cfa_offset(-8)
 	cfi_restore(%r13)
@@ -173,10 +204,16 @@ pthread_rwlock_timedwrlock:
 	cfi_restore(%r12)
 	retq

+#ifdef __ASSUME_FUTEX_CLOCK_REALTIME
+	cfi_adjust_cfa_offset(16)
+	cfi_rel_offset(%r12, 8)
+	cfi_rel_offset(%r13, 0)
+#else
 	cfi_adjust_cfa_offset(40)
 	cfi_offset(%r12, -16)
 	cfi_offset(%r13, -24)
 	cfi_offset(%r14, -32)
+#endif
 1:	movl	PSHARED(%rdi), %esi
 #if MUTEX != 0
 	addq	$MUTEX, %rdi
diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S
index 88e99cf6a1..95762834d3 100644
--- a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S
+++ b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_timedwait.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2002, 2003, 2005, 2007 Free Software Foundation, Inc.
+/* Copyright (C) 2002, 2003, 2005, 2007, 2009 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
@@ -18,6 +18,7 @@
    02111-1307 USA.  */

 #include <sysdep.h>
+#include <kernel-features.h>
 #include <lowlevellock.h>
 #include <shlib-compat.h>
 #include <pthread-errnos.h>
@@ -34,6 +35,15 @@
 	.align	16
 sem_timedwait:
 .LSTARTCODE:
+	cfi_startproc
+#ifdef SHARED
+	cfi_personality(DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect,
+			DW.ref.__gcc_personality_v0)
+	cfi_lsda(DW_EH_PE_pcrel | DW_EH_PE_sdata4, .LexceptSTART)
+#else
+	cfi_personality(DW_EH_PE_udata4, __gcc_personality_v0)
+	cfi_lsda(DW_EH_PE_udata4, .LexceptSTART)
+#endif
 #if VALUE == 0
 	movl	(%rdi), %eax
 #else
@@ -56,13 +66,21 @@ sem_timedwait:
 	/* Check whether the timeout value is valid.  */
 1:	pushq	%r12
-.Lpush_r12:
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%r12, 0)
 	pushq	%r13
-.Lpush_r13:
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%r13, 0)
 	pushq	%r14
-.Lpush_r14:
-	subq	$24, %rsp
-.Lsubq:
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%r14, 0)
+#ifdef __ASSUME_FUTEX_CLOCK_REALTIME
+# define STACKFRAME 8
+#else
+# define STACKFRAME 24
+#endif
+	subq	$STACKFRAME, %rsp
+	cfi_adjust_cfa_offset(STACKFRAME)

 	movq	%rdi, %r12
 	movq	%rsi, %r13
@@ -75,67 +93,50 @@ sem_timedwait:
 	LOCK
 	addq	$1, NWAITERS(%r12)

-7:	xorl	%esi, %esi
-	movq	%rsp, %rdi
-	movq	$VSYSCALL_ADDR_vgettimeofday, %rax
-	callq	*%rax
-
-	/* Compute relative timeout.  */
-	movq	8(%rsp), %rax
-	movl	$1000, %edi
-	mul	%rdi		/* Milli seconds to nano seconds.  */
-	movq	(%r13), %rdi
-	movq	8(%r13), %rsi
-	subq	(%rsp), %rdi
-	subq	%rax, %rsi
-	jns	5f
-	addq	$1000000000, %rsi
-	decq	%rdi
-5:	testq	%rdi, %rdi
-	movl	$ETIMEDOUT, %r14d
-	js	6f		/* Time is already up.  */
-
-	movq	%rdi, (%rsp)	/* Store relative timeout.
*/ - movq %rsi, 8(%rsp) +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME +# ifdef PIC + cmpl $0, __have_futex_clock_realtime(%rip) +# else + cmpl $0, __have_futex_clock_realtime +# endif + je .Lreltmo +#endif .LcleanupSTART: - call __pthread_enable_asynccancel - movl %eax, 16(%rsp) +13: call __pthread_enable_asynccancel + movl %eax, (%rsp) - movq %rsp, %r10 + movq %r13, %r10 #if VALUE == 0 movq %r12, %rdi #else leaq VALUE(%r12), %rdi #endif -#if FUTEX_WAIT == 0 - movl PRIVATE(%rdi), %esi -#else - movl $FUTEX_WAIT, %esi + movl $0xffffffff, %r9d + movl $FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, %esi orl PRIVATE(%rdi), %esi -#endif movl $SYS_futex, %eax xorl %edx, %edx syscall movq %rax, %r14 - movl 16(%rsp), %edi + movl (%rsp), %edi call __pthread_disable_asynccancel .LcleanupEND: testq %r14, %r14 - je 9f + je 11f cmpq $-EWOULDBLOCK, %r14 jne 3f -9: +11: #if VALUE == 0 movl (%r12), %eax #else movl VALUE(%r12), %eax #endif -8: testl %eax, %eax - je 7b +14: testl %eax, %eax + je 13b leaq -1(%rax), %rcx LOCK @@ -144,24 +145,30 @@ sem_timedwait: #else cmpxchgl %ecx, VALUE(%r12) #endif - jne 8b + jne 14b - xorl %eax, %eax +10: xorl %eax, %eax -10: LOCK +15: LOCK subq $1, NWAITERS(%r12) - addq $24, %rsp -.Laddq: + addq $STACKFRAME, %rsp + cfi_adjust_cfa_offset(-STACKFRAME) popq %r14 -.Lpop_r14: + cfi_adjust_cfa_offset(-8) + cfi_restore(%r14) popq %r13 -.Lpop_r13: + cfi_adjust_cfa_offset(-8) + cfi_restore(%r13) popq %r12 -.Lpop_r12: + cfi_adjust_cfa_offset(-8) + cfi_restore(%r12) retq -.Lafter_retq: + cfi_adjust_cfa_offset(STACKFRAME + 3 * 8) + cfi_rel_offset(%r12, STACKFRAME + 2 * 8) + cfi_rel_offset(%r13, STACKFRAME + 1 * 8) + cfi_rel_offset(%r14, STACKFRAME) 3: negq %r14 6: #if USE___THREAD @@ -173,7 +180,82 @@ sem_timedwait: #endif orl $-1, %eax + jmp 15b + +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME +.Lreltmo: +7: xorl %esi, %esi + movq %rsp, %rdi + movq $VSYSCALL_ADDR_vgettimeofday, %rax + callq *%rax + + /* Compute relative timeout. */ + movq 8(%rsp), %rax + movl $1000, %edi + mul %rdi /* Milli seconds to nano seconds. */ + movq (%r13), %rdi + movq 8(%r13), %rsi + subq (%rsp), %rdi + subq %rax, %rsi + jns 5f + addq $1000000000, %rsi + decq %rdi +5: testq %rdi, %rdi + movl $ETIMEDOUT, %r14d + js 6b /* Time is already up. */ + + movq %rdi, (%rsp) /* Store relative timeout. 
*/ + movq %rsi, 8(%rsp) + +.LcleanupSTART2: + call __pthread_enable_asynccancel + movl %eax, 16(%rsp) + + movq %rsp, %r10 +# if VALUE == 0 + movq %r12, %rdi +# else + leaq VALUE(%r12), %rdi +# endif +# if FUTEX_WAIT == 0 + movl PRIVATE(%rdi), %esi +# else + movl $FUTEX_WAIT, %esi + orl PRIVATE(%rdi), %esi +# endif + movl $SYS_futex, %eax + xorl %edx, %edx + syscall + movq %rax, %r14 + + movl 16(%rsp), %edi + call __pthread_disable_asynccancel +.LcleanupEND2: + + testq %r14, %r14 + je 9f + cmpq $-EWOULDBLOCK, %r14 + jne 3b + +9: +# if VALUE == 0 + movl (%r12), %eax +# else + movl VALUE(%r12), %eax +# endif +8: testl %eax, %eax + je 7b + + leaq -1(%rax), %rcx + LOCK +# if VALUE == 0 + cmpxchgl %ecx, (%r12) +# else + cmpxchgl %ecx, VALUE(%r12) +# endif + jne 8b jmp 10b +#endif .size sem_timedwait,.-sem_timedwait @@ -186,21 +268,27 @@ sem_timedwait_cleanup: call _Unwind_Resume@PLT hlt .LENDCODE: + cfi_endproc .size sem_timedwait_cleanup,.-sem_timedwait_cleanup .section .gcc_except_table,"a",@progbits .LexceptSTART: - .byte 0xff # @LPStart format (omit) - .byte 0xff # @TType format (omit) - .byte 0x01 # call-site format - # DW_EH_PE_uleb128 + .byte DW_EH_PE_omit # @LPStart format + .byte DW_EH_PE_omit # @TType format + .byte DW_EH_PE_uleb128 # call-site format .uleb128 .Lcstend-.Lcstbegin .Lcstbegin: .uleb128 .LcleanupSTART-.LSTARTCODE .uleb128 .LcleanupEND-.LcleanupSTART .uleb128 sem_timedwait_cleanup-.LSTARTCODE .uleb128 0 +#ifndef __ASSUME_FUTEX_CLOCK_REALTIME + .uleb128 .LcleanupSTART2-.LSTARTCODE + .uleb128 .LcleanupEND2-.LcleanupSTART2 + .uleb128 sem_timedwait_cleanup-.LSTARTCODE + .uleb128 0 +#endif .uleb128 .LcallUR-.LSTARTCODE .uleb128 .LENDCODE-.LcallUR .uleb128 0 @@ -208,118 +296,6 @@ sem_timedwait_cleanup: .Lcstend: - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE # Length of the CIE. -.LSTARTCIE: - .long 0 # CIE ID. - .byte 1 # Version number. -#ifdef SHARED - .string "zPLR" # NUL-terminated augmentation - # string. -#else - .string "zPL" # NUL-terminated augmentation - # string. -#endif - .uleb128 1 # Code alignment factor. - .sleb128 -8 # Data alignment factor. - .byte 16 # Return address register - # column. -#ifdef SHARED - .uleb128 7 # Augmentation value length. - .byte 0x9b # Personality: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4 - # + DW_EH_PE_indirect - .long DW.ref.__gcc_personality_v0-. - .byte 0x1b # LSDA Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. - .byte 0x1b # FDE Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. -#else - .uleb128 10 # Augmentation value length. - .byte 0x0 # Personality: absolute - .quad __gcc_personality_v0 - .byte 0x0 # LSDA Encoding: absolute -#endif - .byte 0x0c # DW_CFA_def_cfa - .uleb128 7 - .uleb128 8 - .byte 0x90 # DW_CFA_offset, column 0x10 - .uleb128 1 - .align 8 -.LENDCIE: - - .long .LENDFDE-.LSTARTFDE # Length of the FDE. -.LSTARTFDE: - .long .LSTARTFDE-.LSTARTFRAME # CIE pointer. -#ifdef SHARED - .long .LSTARTCODE-. # PC-relative start address - # of the code. - .long .LENDCODE-.LSTARTCODE # Length of the code. - .uleb128 4 # Augmentation size - .long .LexceptSTART-. -#else - .quad .LSTARTCODE # Start address of the code. - .quad .LENDCODE-.LSTARTCODE # Length of the code. 
- .uleb128 8 # Augmentation size - .quad .LexceptSTART -#endif - - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_r12-.LSTARTCODE - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 0x8c # DW_CFA_offset %r12 - .uleb128 2 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_r13-.Lpush_r12 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 24 - .byte 0x8d # DW_CFA_offset %r13 - .uleb128 3 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_r14-.Lpush_r13 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 32 - .byte 0x8e # DW_CFA_offset %r14 - .uleb128 4 - .byte 4 # DW_CFA_advance_loc4 - .long .Lsubq-.Lpush_r14 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 56 - .byte 4 # DW_CFA_advance_loc4 - .long .Laddq-.Lsubq - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 32 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_r14-.Laddq - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 24 - .byte 0xce # DW_CFA_restore %r14 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_r13-.Lpop_r14 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 0xcd # DW_CFA_restore %r13 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_r12-.Lpop_r13 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 8 - .byte 0xcc # DW_CFA_restore %r12 - .byte 4 # DW_CFA_advance_loc4 - .long .Lafter_retq-.Lpop_r12 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 56 - .byte 0x8c # DW_CFA_offset %r12 - .uleb128 2 - .byte 0x8d # DW_CFA_offset %r13 - .uleb128 3 - .byte 0x8e # DW_CFA_offset %r14 - .uleb128 4 - .align 8 -.LENDFDE: - - #ifdef SHARED .hidden DW.ref.__gcc_personality_v0 .weak DW.ref.__gcc_personality_v0 diff --git a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S index 5320a91e19..a01d745a17 100644 --- a/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S +++ b/nptl/sysdeps/unix/sysv/linux/x86_64/sem_wait.S @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003, 2005, 2007 Free Software Foundation, Inc. +/* Copyright (C) 2002, 2003, 2005, 2007, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. 
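The hand-written .eh_frame CIE/FDE boilerplate deleted above is replaced
throughout these files by the cfi_personality/cfi_lsda directives and the
DW_EH_PE_* constants that the sysdeps/generic/sysdep.h hunk below
introduces.  The symbolic encodings compose to exactly the bytes the old
sections spelled out in hex; a minimal standalone C check of that
arithmetic, using the constant values from that hunk:

    #include <assert.h>

    #define DW_EH_PE_sdata4   0x0b
    #define DW_EH_PE_pcrel    0x10
    #define DW_EH_PE_indirect 0x80

    int
    main (void)
    {
      /* 0x1b was the hand-coded LSDA/FDE encoding, commented as
         "DW_EH_PE_pcrel + DW_EH_PE_sdata4" in the deleted sections.  */
      assert ((DW_EH_PE_pcrel | DW_EH_PE_sdata4) == 0x1b);
      /* 0x9b was the hand-coded personality encoding in the SHARED case.  */
      assert ((DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect) == 0x9b);
      return 0;
    }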
@@ -31,16 +31,20 @@ .align 16 sem_wait: .LSTARTCODE: - pushq %r12 -.Lpush_r12: - pushq %r13 -.Lpush_r13: - movq %rdi, %r13 + cfi_startproc +#ifdef SHARED + cfi_personality(DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect, + DW.ref.__gcc_personality_v0) + cfi_lsda(DW_EH_PE_pcrel | DW_EH_PE_sdata4, .LexceptSTART) +#else + cfi_personality(DW_EH_PE_udata4, __gcc_personality_v0) + cfi_lsda(DW_EH_PE_udata4, .LexceptSTART) +#endif #if VALUE == 0 - movl (%r13), %eax + movl (%rdi), %eax #else - movl VALUE(%r13), %eax + movl VALUE(%rdi), %eax #endif 2: testl %eax, %eax je 1f @@ -48,23 +52,24 @@ sem_wait: leal -1(%rax), %edx LOCK #if VALUE == 0 - cmpxchgl %edx, (%r13) + cmpxchgl %edx, (%rdi) #else - cmpxchgl %edx, VALUE(%r13) + cmpxchgl %edx, VALUE(%rdi) #endif jne 2b -7: xorl %eax, %eax - -9: popq %r13 -.Lpop_r13: - popq %r12 -.Lpop_r12: - + xorl %eax, %eax retq -.Lafter_retq: -1: LOCK +1: pushq %r12 + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r12, 0) + pushq %r13 + cfi_adjust_cfa_offset(8) + cfi_rel_offset(%r13, 0) + movq %rdi, %r13 + + LOCK addq $1, NWAITERS(%r13) .LcleanupSTART: @@ -113,8 +118,21 @@ sem_wait: LOCK subq $1, NWAITERS(%r13) - jmp 7b + xorl %eax, %eax + +9: popq %r13 + cfi_adjust_cfa_offset(-8) + cfi_restore(%r13) + popq %r12 + cfi_adjust_cfa_offset(-8) + cfi_restore(%r12) + + retq + + cfi_adjust_cfa_offset(2 * 8) + cfi_rel_offset(%r12, 8) + cfi_rel_offset(%r13, 0) 4: negq %r12 #if USE___THREAD movq errno@gottpoff(%rip), %rdx @@ -141,15 +159,15 @@ sem_wait_cleanup: call _Unwind_Resume@PLT hlt .LENDCODE: + cfi_endproc .size sem_wait_cleanup,.-sem_wait_cleanup .section .gcc_except_table,"a",@progbits .LexceptSTART: - .byte 0xff # @LPStart format (omit) - .byte 0xff # @TType format (omit) - .byte 0x01 # call-site format - # DW_EH_PE_uleb128 + .byte DW_EH_PE_omit # @LPStart format + .byte DW_EH_PE_omit # @TType format + .byte DW_EH_PE_uleb128 # call-site format .uleb128 .Lcstend-.Lcstbegin .Lcstbegin: .uleb128 .LcleanupSTART-.LSTARTCODE @@ -163,97 +181,6 @@ sem_wait_cleanup: .Lcstend: - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE # Length of the CIE. -.LSTARTCIE: - .long 0 # CIE ID. - .byte 1 # Version number. -#ifdef SHARED - .string "zPLR" # NUL-terminated augmentation - # string. -#else - .string "zPL" # NUL-terminated augmentation - # string. -#endif - .uleb128 1 # Code alignment factor. - .sleb128 -8 # Data alignment factor. - .byte 16 # Return address register - # column. -#ifdef SHARED - .uleb128 7 # Augmentation value length. - .byte 0x9b # Personality: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4 - # + DW_EH_PE_indirect - .long DW.ref.__gcc_personality_v0-. - .byte 0x1b # LSDA Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. - .byte 0x1b # FDE Encoding: DW_EH_PE_pcrel - # + DW_EH_PE_sdata4. -#else - .uleb128 10 # Augmentation value length. - .byte 0x0 # Personality: absolute - .quad __gcc_personality_v0 - .byte 0x0 # LSDA Encoding: absolute -#endif - .byte 0x0c # DW_CFA_def_cfa - .uleb128 7 - .uleb128 8 - .byte 0x90 # DW_CFA_offset, column 0x10 - .uleb128 1 - .align 8 -.LENDCIE: - - .long .LENDFDE-.LSTARTFDE # Length of the FDE. -.LSTARTFDE: - .long .LSTARTFDE-.LSTARTFRAME # CIE pointer. -#ifdef SHARED - .long .LSTARTCODE-. # PC-relative start address - # of the code. - .long .LENDCODE-.LSTARTCODE # Length of the code. - .uleb128 4 # Augmentation size - .long .LexceptSTART-. -#else - .quad .LSTARTCODE # Start address of the code. - .quad .LENDCODE-.LSTARTCODE # Length of the code. 
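The "timeout too short" check added to tst-cond11.c above and repeated for
tst-sem5.c below is the usual lexicographic comparison on (tv_sec, tv_nsec):
the clock value sampled after the timed call returns must not be earlier
than the absolute timeout that was passed in.  As a standalone sketch (the
helper name is ours, not part of the patch):

    #include <stdbool.h>
    #include <time.h>

    /* Return true if *a is an earlier point in time than *b; the tests
       fail when the reading taken after the timeout still compares
       earlier than the absolute timeout itself.  */
    static bool
    timespec_earlier (const struct timespec *a, const struct timespec *b)
    {
      return a->tv_sec < b->tv_sec
             || (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec);
    }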
- .uleb128 8 # Augmentation size - .quad .LexceptSTART -#endif - - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_r12-.LSTARTCODE - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 0x8c # DW_CFA_offset %r12 - .uleb128 2 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpush_r13-.Lpush_r12 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 24 - .byte 0x8d # DW_CFA_offset %r13 - .uleb128 3 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_r13-.Lpush_r13 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 16 - .byte 0xcd # DW_CFA_restore %r13 - .byte 4 # DW_CFA_advance_loc4 - .long .Lpop_r12-.Lpop_r13 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 8 - .byte 0xcc # DW_CFA_restore %r12 - .byte 4 # DW_CFA_advance_loc4 - .long .Lafter_retq-.Lpop_r12 - .byte 14 # DW_CFA_def_cfa_offset - .uleb128 24 - .byte 0x8c # DW_CFA_offset %r12 - .uleb128 2 - .byte 0x8d # DW_CFA_offset %r13 - .uleb128 3 - .align 8 -.LENDFDE: - - #ifdef SHARED .hidden DW.ref.__gcc_personality_v0 .weak DW.ref.__gcc_personality_v0 diff --git a/nptl/tst-cond11.c b/nptl/tst-cond11.c index 0de4d56137..4d0c7dd225 100644 --- a/nptl/tst-cond11.c +++ b/nptl/tst-cond11.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2003, 2004 Free Software Foundation, Inc. +/* Copyright (C) 2003, 2004, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2003. @@ -130,6 +130,20 @@ run_test (clockid_t cl) return 1; } + struct timespec ts2; + if (clock_gettime (cl, &ts2) != 0) + { + puts ("second clock_gettime failed"); + return 1; + } + + if (ts2.tv_sec < ts.tv_sec + || (ts2.tv_sec == ts.tv_sec && ts2.tv_nsec < ts.tv_nsec)) + { + puts ("timeout too short"); + return 1; + } + if (pthread_mutex_unlock (&mut) != 0) { puts ("mutex_unlock failed"); diff --git a/nptl/tst-sem5.c b/nptl/tst-sem5.c index cb85b8e769..d3ebe26a40 100644 --- a/nptl/tst-sem5.c +++ b/nptl/tst-sem5.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2002, 2003 Free Software Foundation, Inc. +/* Copyright (C) 2002, 2003, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Ulrich Drepper <drepper@redhat.com>, 2002. @@ -73,6 +73,20 @@ do_test (void) return 1; } + struct timespec ts2; + if (clock_gettime (CLOCK_REALTIME, &ts2) != 0) + { + puts ("clock_gettime failed"); + return 1; + } + + if (ts2.tv_sec < ts.tv_sec + || (ts2.tv_sec == ts.tv_sec && ts2.tv_nsec < ts.tv_nsec)) + { + puts ("timeout too short"); + return 1; + } + return 0; } diff --git a/nscd/cache.c b/nscd/cache.c index ab842efc29..3e6793df2f 100644 --- a/nscd/cache.c +++ b/nscd/cache.c @@ -179,7 +179,7 @@ cache_add (int type, const void *key, size_t len, struct datahead *packet, /* Put the new entry in the first position. */ do newp->next = table->head->array[hash]; - while (atomic_compare_and_exchange_bool_acq (&table->head->array[hash], + while (atomic_compare_and_exchange_bool_rel (&table->head->array[hash], (ref_t) ((char *) newp - table->data), (ref_t) newp->next)); diff --git a/resolv/res_mkquery.c b/resolv/res_mkquery.c index ae0cdb417e..2dda4c0f45 100644 --- a/resolv/res_mkquery.c +++ b/resolv/res_mkquery.c @@ -244,7 +244,7 @@ __res_nopt(res_state statp, *cp++ = 0; /* "." */ NS_PUT16(T_OPT, cp); /* TYPE */ - NS_PUT16(anslen & 0xffff, cp); /* CLASS = UDP payload size */ + NS_PUT16(MIN(anslen, 0xffff), cp); /* CLASS = UDP payload size */ *cp++ = NOERROR; /* extended RCODE */ *cp++ = 0; /* EDNS version */ /* XXX Once we support DNSSEC we change the flag value here. 
*/ diff --git a/string/stpncpy.c b/string/stpncpy.c index 164d0f1747..2ebab33d8a 100644 --- a/string/stpncpy.c +++ b/string/stpncpy.c @@ -28,17 +28,19 @@ # include <sys/types.h> #endif -#ifndef weak_alias -# define __stpncpy stpncpy +#ifndef STPNCPY +# ifdef weak_alias +# define STPNCPY __stpncpy +weak_alias (__stpncpy, stpncpy) +# else +# define STPNCPY stpncpy +# endif #endif /* Copy no more than N characters of SRC to DEST, returning the address of the terminating '\0' in DEST, if any, or else DEST + N. */ char * -__stpncpy (dest, src, n) - char *dest; - const char *src; - size_t n; +STPNCPY (char *dest, const char *src, size_t n) { char c; char *s = dest; @@ -96,5 +98,4 @@ __stpncpy (dest, src, n) } #ifdef weak_alias libc_hidden_def (__stpncpy) -weak_alias (__stpncpy, stpncpy) #endif diff --git a/string/strncpy.c b/string/strncpy.c index f32612e1cf..2274d7d31e 100644 --- a/string/strncpy.c +++ b/string/strncpy.c @@ -21,11 +21,12 @@ #undef strncpy +#ifndef STRNCPY +#define STRNCPY strncpy +#endif + char * -strncpy (s1, s2, n) - char *s1; - const char *s2; - size_t n; +STRNCPY (char *s1, const char *s2, size_t n) { reg_char c; char *s = s1; diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h index b1af7fde0a..30f9d23091 100644 --- a/sysdeps/generic/ldsodefs.h +++ b/sysdeps/generic/ldsodefs.h @@ -335,6 +335,10 @@ struct audit_ifaces extern int _dl_name_match_p (const char *__name, const struct link_map *__map) internal_function; +/* Compute next higher prime number. */ +extern unsigned long int _dl_higher_prime_number (unsigned long int n) + internal_function; + /* Function used as argument for `_dl_receive_error' function. The arguments are the error code, error string, and the objname the error occurred in. */ @@ -383,6 +387,21 @@ struct rtld_global allocated by rtld. Later it keeps the size of the map. It might be reset if in _dl_close if the last global object is removed. */ size_t _ns_global_scope_alloc; + /* Search table for unique objects. */ + struct unique_sym_table + { + __rtld_lock_recursive_t lock; + struct unique_sym + { + uint32_t hashval; + const char *name; + const ElfW(Sym) *sym; + const struct link_map *map; + } *entries; + size_t size; + size_t n_elements; + void (*free) (void *); + } _ns_unique_sym_table; /* Keep track of changes to each namespace' list. */ struct r_debug _ns_debug; } _dl_ns[DL_NNS]; diff --git a/sysdeps/generic/sysdep.h b/sysdeps/generic/sysdep.h index 15d951c777..54884d9afe 100644 --- a/sysdeps/generic/sysdep.h +++ b/sysdeps/generic/sysdep.h @@ -1,5 +1,5 @@ /* Generic asm macros used on many machines. - Copyright (C) 1991,92,93,96,98,2002,2003 Free Software Foundation, Inc. + Copyright (C) 1991,92,93,96,98,2002,2003,2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -39,13 +39,13 @@ #ifdef __ASSEMBLER__ /* Mark the end of function named SYM. This is used on some platforms to generate correct debugging information. */ -#ifndef END -#define END(sym) -#endif +# ifndef END +# define END(sym) +# endif -#ifndef JUMPTARGET -#define JUMPTARGET(sym) sym -#endif +# ifndef JUMPTARGET +# define JUMPTARGET(sym) sym +# endif /* Makros to generate eh_frame unwind information. 
*/ # ifdef HAVE_ASM_CFI_DIRECTIVES @@ -65,6 +65,8 @@ # define cfi_remember_state .cfi_remember_state # define cfi_restore_state .cfi_restore_state # define cfi_window_save .cfi_window_save +# define cfi_personality(enc, exp) .cfi_personality enc, exp +# define cfi_lsda(enc, exp) .cfi_lsda enc, exp # else # define cfi_startproc # define cfi_endproc @@ -82,6 +84,8 @@ # define cfi_remember_state # define cfi_restore_state # define cfi_window_save +# define cfi_personality(enc, exp) +# define cfi_lsda(enc, exp) # endif #else /* ! ASSEMBLER */ @@ -116,6 +120,10 @@ ".cfi_restore_state" # define CFI_WINDOW_SAVE \ ".cfi_window_save" +# define CFI_PERSONALITY(enc, exp) \ + ".cfi_personality " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) +# define CFI_LSDA(enc, exp) \ + ".cfi_lsda " CFI_STRINGIFY(enc) "," CFI_STRINGIFY(exp) # else # define CFI_STARTPROC # define CFI_ENDPROC @@ -132,6 +140,27 @@ # define CFI_REMEMBER_STATE # define CFI_RESTORE_STATE # define CFI_WINDOW_SAVE +# define CFI_PERSONALITY(enc, exp) +# define CFI_LSDA(enc, exp) # endif #endif /* __ASSEMBLER__ */ + +/* Values used for encoding parameter of cfi_personality and cfi_lsda. */ +#define DW_EH_PE_absptr 0x00 +#define DW_EH_PE_omit 0xff +#define DW_EH_PE_uleb128 0x01 +#define DW_EH_PE_udata2 0x02 +#define DW_EH_PE_udata4 0x03 +#define DW_EH_PE_udata8 0x04 +#define DW_EH_PE_sleb128 0x09 +#define DW_EH_PE_sdata2 0x0a +#define DW_EH_PE_sdata4 0x0b +#define DW_EH_PE_sdata8 0x0c +#define DW_EH_PE_signed 0x08 +#define DW_EH_PE_pcrel 0x10 +#define DW_EH_PE_textrel 0x20 +#define DW_EH_PE_datarel 0x30 +#define DW_EH_PE_funcrel 0x40 +#define DW_EH_PE_aligned 0x50 +#define DW_EH_PE_indirect 0x80 diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure index d1d4dc15a7..cbc8cd9206 100755 --- a/sysdeps/i386/configure +++ b/sysdeps/i386/configure @@ -1,10 +1,42 @@ +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + # This file is generated from configure.in by Autoconf. DO NOT EDIT! # Local configure fragment for sysdeps/i386. -echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5 -echo $ECHO_N "checking if -g produces usable source locations for assembler-with-cpp... $ECHO_C" >&6 +{ $as_echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5 +$as_echo_n "checking if -g produces usable source locations for assembler-with-cpp... 
" >&6; } if test "${libc_cv_cpp_asm_debuginfo+set}" = set; then - echo $ECHO_N "(cached) $ECHO_C" >&6 + $as_echo_n "(cached) " >&6 else cat > conftest.S <<EOF #include "confdefs.h" @@ -27,7 +59,7 @@ if { ac_try='${CC-cc} $CPPFLAGS $ASFLAGS -g -c conftest.S 1>&5' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } && { ac_pattern='conftest\.S' { ac_try='readelf --debug-dump=line conftest.o | @@ -35,7 +67,7 @@ if { ac_try='${CC-cc} $CPPFLAGS $ASFLAGS -g -c conftest.S 1>&5' { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 (eval $ac_try) 2>&5 ac_status=$? - echo "$as_me:$LINENO: \$? = $ac_status" >&5 + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 (exit $ac_status); }; } }; then libc_cv_cpp_asm_debuginfo=yes @@ -44,11 +76,36 @@ else fi rm -f conftest* fi -echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5 -echo "${ECHO_T}$libc_cv_cpp_asm_debuginfo" >&6 +{ $as_echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5 +$as_echo "$libc_cv_cpp_asm_debuginfo" >&6; } if test $libc_cv_cpp_asm_debuginfo = yes; then cat >>confdefs.h <<\_ACEOF #define HAVE_CPP_ASM_DEBUGINFO 1 _ACEOF fi + +{ $as_echo "$as_me:$LINENO: checking for SSE4 support" >&5 +$as_echo_n "checking for SSE4 support... " >&6; } +if test "${libc_cv_cc_sse4+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -msse4 -xc /dev/null -S -o /dev/null' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + libc_cv_cc_sse4=yes +else + libc_cv_cc_sse4=no +fi +fi +{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_sse4" >&5 +$as_echo "$libc_cv_cc_sse4" >&6; } +if test $libc_cv_cc_sse4 = yes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_SSE4_SUPPORT 1 +_ACEOF + +fi diff --git a/sysdeps/i386/configure.in b/sysdeps/i386/configure.in index 028e1ae8e1..44f53a57a0 100644 --- a/sysdeps/i386/configure.in +++ b/sysdeps/i386/configure.in @@ -33,3 +33,14 @@ rm -f conftest*])AC_SUBST(libc_cv_cpp_asm_debuginfo) if test $libc_cv_cpp_asm_debuginfo = yes; then AC_DEFINE(HAVE_CPP_ASM_DEBUGINFO) fi + +dnl Check if -msse4 works. +AC_CACHE_CHECK(for SSE4 support, libc_cv_cc_sse4, [dnl +if AC_TRY_COMMAND([${CC-cc} -msse4 -xc /dev/null -S -o /dev/null]); then + libc_cv_cc_sse4=yes +else + libc_cv_cc_sse4=no +fi]) +if test $libc_cv_cc_sse4 = yes; then + AC_DEFINE(HAVE_SSE4_SUPPORT) +fi diff --git a/sysdeps/powerpc/sysdep.h b/sysdeps/powerpc/sysdep.h index 43edeb71eb..f5c79c54ef 100644 --- a/sysdeps/powerpc/sysdep.h +++ b/sysdeps/powerpc/sysdep.h @@ -44,8 +44,8 @@ #define PPC_FEATURE_PA6T 0x00000800 /* PA Semi 6T Core */ #define PPC_FEATURE_HAS_DFP 0x00000400 /* Decimal FP Unit */ #define PPC_FEATURE_POWER6_EXT 0x00000200 /* P6 + mffgpr/mftgpr */ -#define PPC_FEATURE_HAS_VSX 0x00000100 /* P7 Vector Extension. */ -#define PPC_FEATURE_ARCH_2_06 0x00000080 /* ISA 2.06 */ +#define PPC_FEATURE_ARCH_2_06 0x00000100 /* ISA 2.06 */ +#define PPC_FEATURE_HAS_VSX 0x00000080 /* P7 Vector Extension. */ #define PPC_FEATURE_970 (PPC_FEATURE_POWER4 + PPC_FEATURE_HAS_ALTIVEC) #ifdef __ASSEMBLER__ diff --git a/sysdeps/unix/sysv/linux/sys/epoll.h b/sysdeps/unix/sysv/linux/sys/epoll.h index 12de0bcfe2..ca1d3d0459 100644 --- a/sysdeps/unix/sysv/linux/sys/epoll.h +++ b/sysdeps/unix/sysv/linux/sys/epoll.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2002-2006, 2007, 2008 Free Software Foundation, Inc. 
+/* Copyright (C) 2002-2006, 2007, 2008, 2009 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -31,7 +31,7 @@ typedef __sigset_t sigset_t; #endif -/* Flags to be passed to epoll_create2. */ +/* Flags to be passed to epoll_create1. */ enum { EPOLL_CLOEXEC = 02000000, diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile index da82093381..78fdb04fcb 100644 --- a/sysdeps/x86_64/Makefile +++ b/sysdeps/x86_64/Makefile @@ -4,6 +4,7 @@ long-double-fcts = yes ifeq ($(subdir),csu) sysdep_routines += hp-timing elide-routines.os += hp-timing +gen-as-const-headers += link-defines.sym endif ifeq ($(subdir),gmon) diff --git a/sysdeps/x86_64/bits/link.h b/sysdeps/x86_64/bits/link.h index 5676b78753..643a293bb0 100644 --- a/sysdeps/x86_64/bits/link.h +++ b/sysdeps/x86_64/bits/link.h @@ -65,10 +65,19 @@ __END_DECLS /* Registers for entry into PLT on x86-64. */ # if __GNUC_PREREQ (4,0) typedef float La_x86_64_xmm __attribute__ ((__vector_size__ (16))); +typedef float La_x86_64_ymm __attribute__ ((__vector_size__ (32))); # else typedef float La_x86_64_xmm __attribute__ ((__mode__ (__V4SF__))); # endif +typedef union +{ +# if __GNUC_PREREQ (4,0) + La_x86_64_ymm ymm[2]; +# endif + La_x86_64_xmm xmm[4]; +} La_x86_64_vector __attribute__ ((aligned(16))); + typedef struct La_x86_64_regs { uint64_t lr_rdx; @@ -80,6 +89,7 @@ typedef struct La_x86_64_regs uint64_t lr_rbp; uint64_t lr_rsp; La_x86_64_xmm lr_xmm[8]; + La_x86_64_vector lr_vector[8]; } La_x86_64_regs; /* Return values for calls from PLT on x86-64. */ @@ -91,6 +101,8 @@ typedef struct La_x86_64_retval La_x86_64_xmm lrv_xmm1; long double lrv_st0; long double lrv_st1; + La_x86_64_vector lrv_vector0; + La_x86_64_vector lrv_vector1; } La_x86_64_retval; diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S index d8d9bc12a4..49d239f075 100644 --- a/sysdeps/x86_64/dl-trampoline.S +++ b/sysdeps/x86_64/dl-trampoline.S @@ -17,7 +17,9 @@ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */ +#include <config.h> #include <sysdep.h> +#include <link-defines.h> .text .globl _dl_runtime_resolve @@ -89,25 +91,85 @@ _dl_runtime_profile: /* Actively align the La_x86_64_regs structure. */ andq $0xfffffffffffffff0, %rsp - subq $192, %rsp # sizeof(La_x86_64_regs) +# ifdef HAVE_AVX_SUPPORT + /* sizeof(La_x86_64_regs). Need extra space for 8 SSE registers + to detect if any xmm0-xmm7 registers are changed by audit + module. */ + subq $(LR_SIZE + XMM_SIZE*8), %rsp +# else + subq $LR_SIZE, %rsp # sizeof(La_x86_64_regs) +# endif movq %rsp, 24(%rbx) - movq %rdx, (%rsp) # Fill the La_x86_64_regs structure. - movq %r8, 8(%rsp) - movq %r9, 16(%rsp) - movq %rcx, 24(%rsp) - movq %rsi, 32(%rsp) - movq %rdi, 40(%rsp) - movq %rbp, 48(%rsp) + /* Fill the La_x86_64_regs structure. */ + movq %rdx, LR_RDX_OFFSET(%rsp) + movq %r8, LR_R8_OFFSET(%rsp) + movq %r9, LR_R9_OFFSET(%rsp) + movq %rcx, LR_RCX_OFFSET(%rsp) + movq %rsi, LR_RSI_OFFSET(%rsp) + movq %rdi, LR_RDI_OFFSET(%rsp) + movq %rbp, LR_RBP_OFFSET(%rsp) + leaq 48(%rbx), %rax - movq %rax, 56(%rsp) - movaps %xmm0, 64(%rsp) - movaps %xmm1, 80(%rsp) - movaps %xmm2, 96(%rsp) - movaps %xmm3, 112(%rsp) - movaps %xmm4, 128(%rsp) - movaps %xmm5, 144(%rsp) - movaps %xmm7, 160(%rsp) + movq %rax, LR_RSP_OFFSET(%rsp) + + /* We always store the XMM registers even if AVX is available. + This is to provide backward binary compatility for existing + audit modules. 
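+	   (An AVX-aware audit module works with the full ymm copies
+	   stored at LR_VECTOR_OFFSET below instead; the cpuid probe
+	   that follows caches in L(have_avx) whether bit 28 of %ecx
+	   from cpuid leaf 1, the AVX feature flag, is set.)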
*/ + movaps %xmm0, (LR_XMM_OFFSET)(%rsp) + movaps %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp) + movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp) + movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp) + movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp) + movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp) + movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp) + movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp) + +# ifdef HAVE_AVX_SUPPORT + .data +L(have_avx): + .zero 4 + .size L(have_avx), 4 + .previous + + cmpl $0, L(have_avx)(%rip) + jne 1f + movq %rbx, %r11 # Save rbx + movl $1, %eax + cpuid + movq %r11,%rbx # Restore rbx + movl $1, %eax + testl $(1 << 28), %ecx + jne 2f + negl %eax +2: movl %eax, L(have_avx)(%rip) + cmpl $0, %eax + +1: js L(no_avx1) + + /* This is to support AVX audit modules. */ + vmovdqu %ymm0, (LR_VECTOR_OFFSET)(%rsp) + vmovdqu %ymm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp) + vmovdqu %ymm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp) + vmovdqu %ymm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp) + vmovdqu %ymm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp) + vmovdqu %ymm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp) + vmovdqu %ymm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp) + vmovdqu %ymm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp) + + /* Save xmm0-xmm7 registers to detect if any of them are + changed by audit module. */ + vmovdqa %xmm0, (LR_SIZE)(%rsp) + vmovdqa %xmm1, (LR_SIZE + XMM_SIZE)(%rsp) + vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp) + vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp) + vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp) + vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp) + vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp) + vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp) + +L(no_avx1): +# endif movq %rsp, %rcx # La_x86_64_regs pointer to %rcx. movq 48(%rbx), %rdx # Load return address if needed. @@ -119,27 +181,87 @@ _dl_runtime_profile: movq %rax, %r11 # Save return value. movq 8(%rbx), %rax # Get back register content. - movq (%rsp), %rdx - movq 8(%rsp), %r8 - movq 16(%rsp), %r9 - movaps 64(%rsp), %xmm0 - movaps 80(%rsp), %xmm1 - movaps 96(%rsp), %xmm2 - movaps 112(%rsp), %xmm3 - movaps 128(%rsp), %xmm4 - movaps 144(%rsp), %xmm5 - movaps 160(%rsp), %xmm7 - + movq LR_RDX_OFFSET(%rsp), %rdx + movq LR_R8_OFFSET(%rsp), %r8 + movq LR_R9_OFFSET(%rsp), %r9 + + movaps (LR_XMM_OFFSET)(%rsp), %xmm0 + movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1 + movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2 + movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3 + movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4 + movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5 + movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6 + movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7 + +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx2) + + /* Check if any xmm0-xmm7 registers are changed by audit + module. 
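+	   The test below is the usual SSE equality idiom: vpcmpeqq sets
+	   each 64-bit lane of %xmm8 to all-ones where the live register
+	   matches the pristine copy saved at LR_SIZE, and vpmovmskb
+	   reduces that to a 16-bit byte mask, so a mask of exactly
+	   0xffff means the two 16-byte values are identical.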
*/ + vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET)(%rsp), %ymm0 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %ymm1 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %ymm2 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %ymm3 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %ymm4 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %ymm5 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %ymm6 + +1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8 + vpmovmskb %xmm8, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %ymm7 + +L(no_avx2): +1: +# endif movq 16(%rbx), %r10 # Anything in framesize? testq %r10, %r10 - jns 1f + jns 3f /* There's nothing in the frame size, so there will be no call to the _dl_call_pltexit. */ - movq 24(%rsp), %rcx # Get back registers content. - movq 32(%rsp), %rsi - movq 40(%rsp), %rdi + /* Get back registers content. */ + movq LR_RCX_OFFSET(%rsp), %rcx + movq LR_RSI_OFFSET(%rsp), %rsi + movq LR_RDI_OFFSET(%rsp), %rdi movq %rbx, %rsp movq (%rsp), %rbx @@ -151,7 +273,7 @@ _dl_runtime_profile: cfi_adjust_cfa_offset(-48) jmp *%r11 # Jump to function address. -1: +3: cfi_adjust_cfa_offset(48) cfi_rel_offset(%rbx, 0) cfi_def_cfa_register(%rbx) @@ -161,7 +283,7 @@ _dl_runtime_profile: temporary buffer of the size specified by the 'framesize' returned from _dl_profile_fixup */ - leaq 56(%rbx), %rsi # stack + leaq LR_RSP_OFFSET(%rbx), %rsi # stack addq $8, %r10 andq $0xfffffffffffffff0, %r10 movq %r10, %rcx @@ -183,31 +305,80 @@ _dl_runtime_profile: _dl_call_pltexit. The La_x86_64_regs is being pointed by rsp now, so we just need to allocate the sizeof(La_x86_64_retval) space on the stack, since the alignment has already been taken care of. */ - - subq $80, %rsp # sizeof(La_x86_64_retval) +# ifdef HAVE_AVX_SUPPORT + /* sizeof(La_x86_64_retval). Need extra space for 2 SSE + registers to detect if xmm0/xmm1 registers are changed + by audit module. */ + subq $(LRV_SIZE + XMM_SIZE*2), %rsp +# else + subq $LRV_SIZE, %rsp # sizeof(La_x86_64_retval) +# endif movq %rsp, %rcx # La_x86_64_retval argument to %rcx. - movq %rax, (%rcx) # Fill in the La_x86_64_retval structure. - movq %rdx, 8(%rcx) - movaps %xmm0, 16(%rcx) - movaps %xmm1, 32(%rcx) - fstpt 48(%rcx) - fstpt 64(%rcx) + /* Fill in the La_x86_64_retval structure. */ + movq %rax, LRV_RAX_OFFSET(%rcx) + movq %rdx, LRV_RDX_OFFSET(%rcx) + + movaps %xmm0, LRV_XMM0_OFFSET(%rcx) + movaps %xmm1, LRV_XMM1_OFFSET(%rcx) + +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx3) + + /* This is to support AVX audit modules. */ + vmovdqu %ymm0, LRV_VECTOR0_OFFSET(%rcx) + vmovdqu %ymm1, LRV_VECTOR1_OFFSET(%rcx) + + /* Save xmm0/xmm1 registers to detect if they are changed + by audit module. 
*/ + vmovdqa %xmm0, (LRV_SIZE)(%rcx) + vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx) + +L(no_avx3): +# endif + + fstpt LRV_ST0_OFFSET(%rcx) + fstpt LRV_ST1_OFFSET(%rcx) movq 24(%rbx), %rdx # La_x86_64_regs argument to %rdx. movq 40(%rbx), %rsi # Copy args pushed by PLT in register. movq 32(%rbx), %rdi # %rdi: link_map, %rsi: reloc_index call _dl_call_pltexit - movq (%rsp), %rax # Restore return registers. - movq 8(%rsp), %rdx - movaps 16(%rsp), %xmm0 - movaps 32(%rsp), %xmm1 - fldt 64(%rsp) - fldt 48(%rsp) + /* Restore return registers. */ + movq LRV_RAX_OFFSET(%rsp), %rax + movq LRV_RDX_OFFSET(%rsp), %rdx + + movaps LRV_XMM0_OFFSET(%rsp), %xmm0 + movaps LRV_XMM1_OFFSET(%rsp), %xmm1 + +# ifdef HAVE_AVX_SUPPORT + cmpl $0, L(have_avx)(%rip) + js L(no_avx4) + + /* Check if xmm0/xmm1 registers are changed by audit module. */ + vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2 + vpmovmskb %xmm2, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu LRV_VECTOR0_OFFSET(%rsp), %ymm0 + +1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2 + vpmovmskb %xmm2, %esi + cmpl $0xffff, %esi + je 1f + vmovdqu LRV_VECTOR1_OFFSET(%rsp), %ymm1 + +L(no_avx4): +1: +# endif + + fldt LRV_ST1_OFFSET(%rsp) + fldt LRV_ST0_OFFSET(%rsp) movq %rbx, %rsp - movq (%rsp), %rbx + movq (%rsp), %rbx cfi_restore(rbx) cfi_def_cfa_register(%rsp) diff --git a/sysdeps/x86_64/elf/configure b/sysdeps/x86_64/elf/configure index 774654997d..221e74c2b8 100755 --- a/sysdeps/x86_64/elf/configure +++ b/sysdeps/x86_64/elf/configure @@ -79,3 +79,28 @@ cat >>confdefs.h <<\_ACEOF #define PI_STATIC_AND_HIDDEN 1 _ACEOF + +{ $as_echo "$as_me:$LINENO: checking for AVX support" >&5 +$as_echo_n "checking for AVX support... " >&6; } +if test "${libc_cv_cc_avx+set}" = set; then + $as_echo_n "(cached) " >&6 +else + if { ac_try='${CC-cc} -mavx -xc /dev/null -S -o /dev/null' + { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5 + (eval $ac_try) 2>&5 + ac_status=$? + $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); }; }; then + libc_cv_cc_avx=yes +else + libc_cv_cc_avx=no +fi +fi +{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_avx" >&5 +$as_echo "$libc_cv_cc_avx" >&6; } +if test $libc_cv_cc_avx = yes; then + cat >>confdefs.h <<\_ACEOF +#define HAVE_AVX_SUPPORT 1 +_ACEOF + +fi diff --git a/sysdeps/x86_64/elf/configure.in b/sysdeps/x86_64/elf/configure.in index 9cb59d009c..14d1875302 100644 --- a/sysdeps/x86_64/elf/configure.in +++ b/sysdeps/x86_64/elf/configure.in @@ -32,3 +32,14 @@ fi dnl It is always possible to access static and hidden symbols in an dnl position independent way. AC_DEFINE(PI_STATIC_AND_HIDDEN) + +dnl Check if -mavx works. 
+AC_CACHE_CHECK(for AVX support, libc_cv_cc_avx, [dnl +if AC_TRY_COMMAND([${CC-cc} -mavx -xc /dev/null -S -o /dev/null]); then + libc_cv_cc_avx=yes +else + libc_cv_cc_avx=no +fi]) +if test $libc_cv_cc_avx = yes; then + AC_DEFINE(HAVE_AVX_SUPPORT) +fi diff --git a/sysdeps/x86_64/link-defines.sym b/sysdeps/x86_64/link-defines.sym new file mode 100644 index 0000000000..1694d883ad --- /dev/null +++ b/sysdeps/x86_64/link-defines.sym @@ -0,0 +1,28 @@ +#include "link.h" +#include <stddef.h> + +-- +VECTOR_SIZE sizeof (La_x86_64_vector) +XMM_SIZE sizeof (La_x86_64_xmm) + +LR_SIZE sizeof (struct La_x86_64_regs) +LR_RDX_OFFSET offsetof (struct La_x86_64_regs, lr_rdx) +LR_R8_OFFSET offsetof (struct La_x86_64_regs, lr_r8) +LR_R9_OFFSET offsetof (struct La_x86_64_regs, lr_r9) +LR_RCX_OFFSET offsetof (struct La_x86_64_regs, lr_rcx) +LR_RSI_OFFSET offsetof (struct La_x86_64_regs, lr_rsi) +LR_RDI_OFFSET offsetof (struct La_x86_64_regs, lr_rdi) +LR_RBP_OFFSET offsetof (struct La_x86_64_regs, lr_rbp) +LR_RSP_OFFSET offsetof (struct La_x86_64_regs, lr_rsp) +LR_XMM_OFFSET offsetof (struct La_x86_64_regs, lr_xmm) +LR_VECTOR_OFFSET offsetof (struct La_x86_64_regs, lr_vector) + +LRV_SIZE sizeof (struct La_x86_64_retval) +LRV_RAX_OFFSET offsetof (struct La_x86_64_retval, lrv_rax) +LRV_RDX_OFFSET offsetof (struct La_x86_64_retval, lrv_rdx) +LRV_XMM0_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm0) +LRV_XMM1_OFFSET offsetof (struct La_x86_64_retval, lrv_xmm1) +LRV_ST0_OFFSET offsetof (struct La_x86_64_retval, lrv_st0) +LRV_ST1_OFFSET offsetof (struct La_x86_64_retval, lrv_st1) +LRV_VECTOR0_OFFSET offsetof (struct La_x86_64_retval, lrv_vector0) +LRV_VECTOR1_OFFSET offsetof (struct La_x86_64_retval, lrv_vector1) diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S new file mode 100644 index 0000000000..a9fe13ae58 --- /dev/null +++ b/sysdeps/x86_64/memcmp.S @@ -0,0 +1,359 @@ +/* memcmp with SSE2 + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> + + .text +ENTRY (memcmp) + test %rdx, %rdx + jz L(finz) + cmpq $1, %rdx + jle L(finr1b) + subq %rdi, %rsi + movq %rdx, %r10 + cmpq $32, %r10 + jge L(gt32) + /* Handle small chunks and last block of less than 32 bytes. 
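+	   The code below peels the remaining count in %r10 one power of
+	   two at a time (1-, 2-, 4-, then 8-byte compares) and finishes
+	   any 16-byte remainder with a single movdqu/pcmpeqb/pmovmskb
+	   test; in C-intrinsics terms a 16-byte block compares equal
+	   exactly when _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b)) == 0xffff.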
*/
+L(small):
+	testq	$1, %r10
+	jz	L(s2b)
+	movzbl	(%rdi), %eax
+	movzbl	(%rdi, %rsi), %edx
+	subq	$1, %r10
+	je	L(finz1)
+	addq	$1, %rdi
+	subl	%edx, %eax
+	jnz	L(exit)
+L(s2b):
+	testq	$2, %r10
+	jz	L(s4b)
+	movzwl	(%rdi), %eax
+	movzwl	(%rdi, %rsi), %edx
+	subq	$2, %r10
+	je	L(fin2_7)
+	addq	$2, %rdi
+	cmpl	%edx, %eax
+	jnz	L(fin2_7)
+L(s4b):
+	testq	$4, %r10
+	jz	L(s8b)
+	movl	(%rdi), %eax
+	movl	(%rdi, %rsi), %edx
+	subq	$4, %r10
+	je	L(fin2_7)
+	addq	$4, %rdi
+	cmpl	%edx, %eax
+	jnz	L(fin2_7)
+L(s8b):
+	testq	$8, %r10
+	jz	L(s16b)
+	movq	(%rdi), %rax
+	movq	(%rdi, %rsi), %rdx
+	subq	$8, %r10
+	je	L(fin2_7)
+	addq	$8, %rdi
+	cmpq	%rdx, %rax
+	jnz	L(fin2_7)
+L(s16b):
+	movdqu	(%rdi), %xmm1
+	movdqu	(%rdi, %rsi), %xmm0
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	xorl	%eax, %eax
+	subl	$0xffff, %edx
+	jz	L(finz)
+	bsfl	%edx, %ecx
+	leaq	(%rdi, %rcx), %rcx
+	movzbl	(%rcx), %eax
+	movzbl	(%rsi, %rcx), %edx
+	jmp	L(finz1)
+
+	.p2align 4,, 4
+L(finr1b):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %edx
+L(finz1):
+	subl	%edx, %eax
+L(exit):
+	ret
+
+	.p2align 4,, 4
+L(fin2_7):
+	cmpq	%rdx, %rax
+	jz	L(finz)
+	movq	%rax, %r11
+	subq	%rdx, %r11
+	bsfq	%r11, %rcx
+	sarq	$3, %rcx
+	salq	$3, %rcx
+	sarq	%cl, %rax
+	movzbl	%al, %eax
+	sarq	%cl, %rdx
+	movzbl	%dl, %edx
+	subl	%edx, %eax
+	ret
+
+	.p2align 4,, 4
+L(finz):
+	xorl	%eax, %eax
+	ret
+
+	/* For blocks bigger than 32 bytes:
+	   1. Advance one of the address pointers to be 16-byte aligned.
+	   2. Treat the case of both address pointers aligned to 16 bytes
+	      separately to avoid movdqu.
+	   3. Handle any blocks of more than 64 consecutive bytes with
+	      loop unrolling to reduce branches.
+	   4. At least one address pointer is 16-byte aligned, so the
+	      memory form of pcmpeqb can be used.
+	*/
+	.p2align 4,, 4
+L(gt32):
+	movq	%rdx, %r11
+	addq	%rdi, %r11
+	movq	%rdi, %r8
+
+	andq	$15, %r8
+	jz	L(16am)
+	/* Both pointers may be misaligned.  */
+	movdqu	(%rdi), %xmm1
+	movdqu	(%rdi, %rsi), %xmm0
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %edx
+	subl	$0xffff, %edx
+	jnz	L(neq)
+	neg	%r8
+	leaq	16(%rdi, %r8), %rdi
+L(16am):
+	/* Handle two 16-byte aligned pointers separately.  */
+	testq	$15, %rsi
+	jz	L(ATR)
+	testq	$16, %rdi
+	jz	L(A32)
+	movdqu	(%rdi, %rsi), %xmm0
+	pcmpeqb	(%rdi), %xmm0
+	pmovmskb %xmm0, %edx
+	subl	$0xffff, %edx
+	jnz	L(neq)
+	addq	$16, %rdi
+L(A32):
+	movq	%r11, %r10
+	andq	$-32, %r10
+	cmpq	%r10, %rdi
+	jge	L(mt16)
+	/* Pre-unroll to be ready for unrolled 64B loop.
*/ + testq $32, %rdi + jz L(A64) + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(A64): + movq %r11, %r10 + andq $-64, %r10 + cmpq %r10, %rdi + jge L(mt32) + +L(A64main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A64main) + +L(mt32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(A32main): + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqu (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %rdi, %r10 + jne L(A32main) +L(mt16): + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + + .p2align 4,, 4 +L(neq): + bsfl %edx, %ecx + movzbl (%rdi, %rcx), %eax + addq %rdi, %rsi + movzbl (%rsi,%rcx), %edx + jmp L(finz1) + + .p2align 4,, 4 +L(ATR): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + testq $16, %rdi + jz L(ATR32) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + je L(mt16) + +L(ATR32): + movq %r11, %r10 + andq $-64, %r10 + testq $32, %rdi + jz L(ATR64) + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + +L(ATR64): + cmpq %rdi, %r10 + je L(mt32) + +L(ATR64main): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + cmpq %rdi, %r10 + jne L(ATR64main) + + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi + jge L(mt16) + +L(ATR32res): + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + movdqa (%rdi,%rsi), %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %edx + subl $0xffff, %edx + jnz L(neq) + addq $16, %rdi + + cmpq %r10, %rdi + jne L(ATR32res) + + subq %rdi, %r11 + je L(finz) + movq %r11, %r10 + jmp L(small) + /* Align to 16byte to improve instruction fetch. 
*/ + .p2align 4,, 4 +END(memcmp) + +#undef bcmp +weak_alias (memcmp, bcmp) +libc_hidden_builtin_def (memcmp) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 1c35e1ffb4..71e85f0652 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,5 +4,11 @@ gen-as-const-headers += ifunc-defines.sym endif ifeq ($(subdir),string) -sysdep_routines += strncmp-c +sysdep_routines += stpncpy-c strncpy-c strncmp-c +ifeq (yes,$(config-cflags-sse4)) +sysdep_routines += strcspn-c strpbrk-c strspn-c +CFLAGS-strcspn-c.c += -msse4 +CFLAGS-strpbrk-c.c += -msse4 +CFLAGS-strspn-c.c += -msse4 +endif endif diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S index 93ca631633..d4f265f430 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr.S +++ b/sysdeps/x86_64/multiarch/rawmemchr.S @@ -77,6 +77,7 @@ __rawmemchr_sse42: # undef ENTRY # define ENTRY(name) \ .type __rawmemchr_sse2, @function; \ + .align 16; \ __rawmemchr_sse2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/sysdeps/x86_64/multiarch/stpcpy.S b/sysdeps/x86_64/multiarch/stpcpy.S new file mode 100644 index 0000000000..b63d308edc --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy.S @@ -0,0 +1,7 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c new file mode 100644 index 0000000000..2fde77dcab --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-c.c @@ -0,0 +1,8 @@ +#define STPNCPY __stpncpy_sse2 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2); +#endif + +#include "stpncpy.c" diff --git a/sysdeps/x86_64/multiarch/stpncpy.S b/sysdeps/x86_64/multiarch/stpncpy.S new file mode 100644 index 0000000000..ff89a89491 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy.S @@ -0,0 +1,6 @@ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S index 2f4bf17d95..37985036aa 100644 --- a/sysdeps/x86_64/multiarch/strcmp.S +++ b/sysdeps/x86_64/multiarch/strcmp.S @@ -1659,6 +1659,7 @@ LABEL(unaligned_table): # undef ENTRY # define ENTRY(name) \ .type STRCMP_SSE2, @function; \ + .align 16; \ STRCMP_SSE2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S new file mode 100644 index 0000000000..25cd01307d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy.S @@ -0,0 +1,1918 @@ +/* strcpy with SSSE3 + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <ifunc-defines.h> + +#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY) +# ifndef STRCPY +# define STRCPY strcpy +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __stpncpy_ssse3 +# define STRCPY_SSE2 __stpncpy_sse2 +# define __GI_STRCPY __GI_stpncpy +# else +# define STRCPY_SSSE3 __stpcpy_ssse3 +# define STRCPY_SSE2 __stpcpy_sse2 +# define __GI_STRCPY __GI_stpcpy +# define __GI___STRCPY __GI___stpcpy +# endif +#else +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __strncpy_ssse3 +# define STRCPY_SSE2 __strncpy_sse2 +# define __GI_STRCPY __GI_strncpy +# else +# define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_SSE2 __strcpy_sse2 +# define __GI_STRCPY __GI_strcpy +# endif +#endif + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc + .text +ENTRY(STRCPY) + .type STRCPY, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRCPY_SSE2(%rip), %rax + testl $(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 3f +/* Avoid SSSE3 strcpy on Atom since it is slow. */ + cmpl $1, __cpu_features+KIND_OFFSET(%rip) + jne 2f + cmpl $6, __cpu_features+FAMILY_OFFSET(%rip) + jne 2f + cmpl $28, __cpu_features+MODEL_OFFSET(%rip) + jz 3f +2: leaq STRCPY_SSSE3(%rip), %rax +3: ret +END(STRCPY) + + .section .text.ssse3,"ax",@progbits +STRCPY_SSSE3: + cfi_startproc + CALL_MCOUNT + +/* + * This implementation uses SSE to copy up to 16 bytes at a time. + */ +#ifdef USE_AS_STRNCPY + test %rdx, %rdx + jz LABEL(strncpy_exitz) + mov %rdx, %r8 +#else + xor %edx, %edx +#endif + mov %esi, %ecx + and $0xfffffffffffffff0, %rsi /*force rsi 16 byte align*/ + and $15, %ecx + mov %rdi, %rax /*store return parameter*/ + + + pxor %xmm0, %xmm0 /* clear %xmm0 */ + pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/ + pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/ + shr %cl, %edx /* get real bits left in edx*/ + test %edx, %edx /* edx must be 0 if there is no null char from rsi+%rcx */ + jnz LABEL(less16bytes) + +#ifdef USE_AS_STRNCPY + lea -16(%r8,%rcx), %r11 + cmp $0, %r11 + jle LABEL(less16bytes) /* if r8 + rcx <= 16, branch to less16bytes. 
 */
+#endif
+
+	mov	%rcx, %r9
+	or	%edi, %ecx
+	and	$15, %ecx
+	lea	-16(%r9), %r10
+	jz	LABEL(ashr_0)	/* ecx must be 0 if both rsi and rdi are 16-byte aligned */
+
+	neg	%r10		/* store the rest in rsi aligned 16 bytes for unaligned_exit */
+
+	pxor	%xmm0, %xmm0	/* clear %xmm0, may be polluted by unaligned operation */
+	pcmpeqb	16(%rsi), %xmm0	/* compare 16 bytes in 16(%rsi) and %xmm0 for equality, try to find null char */
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(less32bytes)
+	/*
+	 * at least 16 bytes available to fill destination rdi
+	 */
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(less32bytes_strncpy_truncation)
+#endif
+	mov	(%rsi, %r9), %rdx
+	mov	%rdx, (%rdi)
+	mov	8(%rsi, %r9), %rdx
+	mov	%rdx, 8(%rdi)
+
+	/*
+	 * so far destination rdi may be aligned by 16, recalculate rsi
+	 * to jump to the corresponding case
+	 * rcx is offset of rsi
+	 * rax is offset of rdi
+	 */
+
+	and	$0xfffffffffffffff0, %rdi	/* force rdi 16-byte aligned */
+	mov	%rax, %rdx	/* rax stores the original rdi */
+	xor	%rdi, %rdx	/* equal to and $15, %rdx */
+#ifdef USE_AS_STRNCPY
+	add	%rdx, %r8
+#endif
+
+	add	$16, %rdi	/* next 16 bytes for rdi */
+	sub	%rdx, %r9
+
+	lea	16(%r9, %rsi), %rsi	/* recalculate rsi by (16 - rdx) + rcx */
+	mov	%esi, %ecx	/* store offset of rsi */
+	and	$0xfffffffffffffff0, %rsi	/* force rsi 16-byte aligned */
+
+	and	$15, %ecx	/* ecx must be 0 if rdx is equal to rcx */
+	jz	LABEL(ashr_0)
+
+	lea	-16(%rcx), %r10
+	mov	%rcx, %r9
+	neg	%r10
+	lea	LABEL(unaligned_table)(%rip), %r11
+	movslq	(%r11, %rcx, 4), %rcx
+	lea	(%r11, %rcx), %rcx
+	jmp	*%rcx
+
+	/*
+	 * The following cases will be handled by ashr_0 & ashr_0_start
+	 *  rcx(offset of rsi)	rax(offset of rdi)	relative offset	corresponding case
+	 *	0		0			0		ashr_0
+	 *	n(1~15)		n(1~15)			0		ashr_0_start
+	 *
+	 */
+	.p2align 5
+LABEL(ashr_0):
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa	(%rsi), %xmm1		/* fetch first 16 bytes from rsi */
+	movdqa	%xmm1, (%rdi)		/* store first 16 bytes into rdi */
+	add	$16, %rsi
+	add	$16, %rdi
+	pcmpeqb	(%rsi), %xmm0		/* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
+	pmovmskb %xmm0, %edx		/* move each byte mask of %xmm0 to edx */
+
+	test	%edx, %edx		/* edx must be 0 if there is no null char in rsi */
+	jnz	LABEL(aligned_16bytes)
+
+LABEL(ashr_0_loop):
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	%xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb	(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	%xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb	(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	%xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb	(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	LABEL(aligned_exit)
+
+#ifdef USE_AS_STRNCPY
+	sub	$16, %r8
+	jbe	LABEL(strncpy_truncation_aligned)
+#endif
+	movdqa	(%rsi, %rcx), %xmm1
+	movdqa	%xmm1, (%rdi, %rcx)
+	add	$16, %rcx
+	pcmpeqb	(%rsi, %rcx), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jz	LABEL(ashr_0_loop)
+
+	jmp	LABEL(aligned_exit)
+	.p2align 4
+
+/*
+ * The following cases will be handled by ashr_15
+ *  rcx(offset of rsi)	rax(offset of rdi)	relative offset	corresponding case
+ *	n(15)		n - 15
15((16 - (n -15) + n)%16 ashr_15 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_15): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_15_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $15, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $15, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_15_use_ssse3) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n - 14 14((16 - (n -14) + n)%16 ashr_14 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_14): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_14_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $14, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $14, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_14_use_ssse3) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n - 13 13((16 - (n -13) + n)%16 ashr_13 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_13): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_13_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $13, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $13, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe 
LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_13_use_ssse3) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n - 12 12((16 - (n -12) + n)%16 ashr_12 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_12): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_12_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $12, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $12, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_12_use_ssse3) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 11((16 - (n -11) + n)%16 ashr_11 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_11): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_11_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $11, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $11, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_11_use_ssse3) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 10((16 - (n -10) + n)%16 ashr_10 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_10): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_10_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $10, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) 
+#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $10, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_10_use_ssse3) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 9((16 - (n -9) + n)%16 ashr_9 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_9): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_9_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $9, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $9, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_9_use_ssse3) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 8((16 - (n -8) + n)%16 ashr_8 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_8): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_8_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $8, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $8, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_8_use_ssse3) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 7((16 - (n -7) + n)%16 ashr_7 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_7): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + .p2align 4 + +LABEL(ashr_7_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $7, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef 
USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $7, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_7_use_ssse3) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 6((16 - (n -6) + n)%16 ashr_6 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_6): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_6_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $6, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $6, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_6_use_ssse3) + + /* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 5((16 - (n -5) + n)%16 ashr_5 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_5): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_5_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $5, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $5, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_5_use_ssse3) + +/* + * + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 4((16 - (n -4) + n)%16 ashr_4 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_4): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_4_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz 
LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $4, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $4, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_4_use_ssse3) + +/* + * + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 3((16 - (n -3) + n)%16 ashr_3 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_3): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_3_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $3, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $3, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_3_use_ssse3) + +/* + * + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 2((16 - (n -2) + n)%16 ashr_2 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_2): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_2_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $2, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $2, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_2_use_ssse3) + +/* + * + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 1 ((16 - (n -1) + n)%16 ashr_1 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_1): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY 
+ cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_1_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $1, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + palignr $1, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_1_use_ssse3) + + .p2align 4 +LABEL(less32bytes): + xor %ecx, %ecx +LABEL(unaligned_exit): + add %r9, %rsi /* r9 stores original offset of rsi*/ + mov %rcx, %r9 + mov %r10, %rcx + shl %cl, %edx /* after shl, calculate the exact number to be filled*/ + mov %r9, %rcx + .p2align 4 +LABEL(aligned_exit): + add %rcx, %rdi /*locate exact address for rdi */ +LABEL(less16bytes): + add %rcx, %rsi /*locate exact address for rsi */ +LABEL(aligned_16bytes): +#ifdef USE_AS_STRNCPY + mov $1, %r9d + lea -1(%r8), %rcx + shl %cl, %r9d + cmp $32, %r8 + ja LABEL(strncpy_tail) + or %r9d, %edx +LABEL(strncpy_tail): +#endif + bsf %rdx, %rcx /*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/ + lea LABEL(tail_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + +#ifdef USE_AS_STRNCPY + .p2align 4 +LABEL(less32bytes_strncpy_truncation): + xor %ecx, %ecx +LABEL(strncpy_truncation_unaligned): + add %r9, %rsi +LABEL(strncpy_truncation_aligned): + add %rcx, %rdi + add %rcx, %rsi + add $16, %r8 + lea -1(%r8), %rcx + lea LABEL(tail_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + .p2align 4 +LABEL(strncpy_exitz): + mov %rdi, %rax + ret +#endif + +#ifdef USE_AS_STRNCPY + .p2align 4 +LABEL(strncpy_fill_tail): + mov %rax, %rdx + movzx %cl, %rax + mov %r8, %rcx + add %rax, %rdi + xor %eax, %eax + shr $3, %ecx + jz LABEL(strncpy_fill_less_8) + + rep stosq +LABEL(strncpy_fill_less_8): + mov %r8, %rcx + and $7, %ecx + jz LABEL(strncpy_fill_return) +LABEL(strncpy_fill_less_7): + sub $1, %ecx + mov %al, (%rdi, %rcx) + jnz LABEL(strncpy_fill_less_7) +LABEL(strncpy_fill_return): +#ifdef USE_AS_STPCPY + cmpb $1, (%rdx) + sbb $-1, %rdx +#endif + mov %rdx, %rax + ret +#endif + .p2align 4 +LABEL(tail_0): + mov (%rsi), %cl + mov %cl, (%rdi) +#ifdef USE_AS_STPCPY + mov %rdi, %rax +#endif +#ifdef USE_AS_STRNCPY + mov $1, %cl + sub $1, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_1): + mov (%rsi), %cx + mov %cx, (%rdi) +#ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $2, %cl + sub $2, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_2): + mov (%rsi), %cx + mov %cx, (%rdi) + mov 1(%rsi), %cx + mov %cx, 1(%rdi) +#ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $3, %cl + sub $3, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_3): + mov (%rsi), %ecx + mov %ecx, (%rdi) +#ifdef 
USE_AS_STPCPY + lea 3(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $4, %cl + sub $4, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_4): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 1(%rsi), %edx + mov %edx, 1(%rdi) +#ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $5, %cl + sub $5, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_5): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 2(%rsi), %edx + mov %edx, 2(%rdi) +#ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $6, %cl + sub $6, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_6): + mov (%rsi), %ecx + mov %ecx, (%rdi) + mov 3(%rsi), %edx + mov %edx,3(%rdi) +#ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $7, %cl + sub $7, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_7): + mov (%rsi), %rcx + mov %rcx, (%rdi) +#ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $8, %cl + sub $8, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_8): + + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 5(%rsi), %edx + mov %edx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $9, %cl + sub $9, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_9): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 6(%rsi), %edx + mov %edx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $10, %cl + sub $10, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_10): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 7(%rsi), %edx + mov %edx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $11, %cl + sub $11, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_11): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %edx + mov %edx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $12, %cl + sub $12, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_12): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 5(%rsi), %rcx + mov %rcx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $13, %cl + sub $13, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_13): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 6(%rsi), %rcx + mov %rcx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $14, %cl + sub $14, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_14): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 7(%rsi), %rcx + mov %rcx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov 
$15, %cl + sub $15, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + +LABEL(tail_15): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $16, %cl + sub $16, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_16): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %cl + mov %cl, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $17, %cl + sub $17, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_17): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %cx + mov %cx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $18, %cl + sub $18, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_18): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 15(%rsi), %ecx + mov %ecx,15(%rdi) +#ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $19, %cl + sub $19, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_19): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %ecx + mov %ecx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $20, %cl + sub $20, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_20): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 13(%rsi), %rcx + mov %rcx, 13(%rdi) +#ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $21, %cl + sub $21, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_21): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 14(%rsi), %rcx + mov %rcx, 14(%rdi) +#ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $22, %cl + sub $22, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_22): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 15(%rsi), %rcx + mov %rcx, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $23, %cl + sub $23, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_23): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $24, %cl + sub $24, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_24): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 21(%rsi), %edx + mov %edx, 21(%rdi) 
+#ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $25, %cl + sub $25, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_25): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 22(%rsi), %edx + mov %edx, 22(%rdi) +#ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $26, %cl + sub $26, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_26): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 23(%rsi), %edx + mov %edx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $27, %cl + sub $27, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_27): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 24(%rsi), %edx + mov %edx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $28, %cl + sub $28, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + .p2align 4 +LABEL(tail_28): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 21(%rsi), %rdx + mov %rdx, 21(%rdi) +#ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $29, %cl + sub $29, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + .p2align 4 +LABEL(tail_29): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 22(%rsi), %rdx + mov %rdx, 22(%rdi) +#ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $30, %cl + sub $30, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + + ret + + + .p2align 4 +LABEL(tail_30): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 23(%rsi), %rdx + mov %rdx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $31, %cl + sub $31, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + + .p2align 4 +LABEL(tail_31): + mov (%rsi), %rcx + mov %rcx, (%rdi) + mov 8(%rsi), %rdx + mov %rdx, 8(%rdi) + mov 16(%rsi), %rcx + mov %rcx, 16(%rdi) + mov 24(%rsi), %rdx + mov %rdx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +#endif +#ifdef USE_AS_STRNCPY + mov $32, %cl + sub $32, %r8 + jnz LABEL(strncpy_fill_tail) +#ifdef USE_AS_STPCPY + cmpb $1, (%rax) + sbb $-1, %rax +#endif +#endif + ret + cfi_endproc + .size STRCPY_SSSE3, .-STRCPY_SSSE3 + + .p2align 4 + .section .rodata.ssse3,"a",@progbits +LABEL(tail_table): + .int LABEL(tail_0) - LABEL(tail_table) + .int LABEL(tail_1) - LABEL(tail_table) + .int LABEL(tail_2) - LABEL(tail_table) + .int LABEL(tail_3) - LABEL(tail_table) + .int LABEL(tail_4) - LABEL(tail_table) + .int LABEL(tail_5) - LABEL(tail_table) + .int LABEL(tail_6) - LABEL(tail_table) + .int LABEL(tail_7) - LABEL(tail_table) + .int LABEL(tail_8) - 
LABEL(tail_table) + .int LABEL(tail_9) - LABEL(tail_table) + .int LABEL(tail_10) - LABEL(tail_table) + .int LABEL(tail_11) - LABEL(tail_table) + .int LABEL(tail_12) - LABEL(tail_table) + .int LABEL(tail_13) - LABEL(tail_table) + .int LABEL(tail_14) - LABEL(tail_table) + .int LABEL(tail_15) - LABEL(tail_table) + .int LABEL(tail_16) - LABEL(tail_table) + .int LABEL(tail_17) - LABEL(tail_table) + .int LABEL(tail_18) - LABEL(tail_table) + .int LABEL(tail_19) - LABEL(tail_table) + .int LABEL(tail_20) - LABEL(tail_table) + .int LABEL(tail_21) - LABEL(tail_table) + .int LABEL(tail_22) - LABEL(tail_table) + .int LABEL(tail_23) - LABEL(tail_table) + .int LABEL(tail_24) - LABEL(tail_table) + .int LABEL(tail_25) - LABEL(tail_table) + .int LABEL(tail_26) - LABEL(tail_table) + .int LABEL(tail_27) - LABEL(tail_table) + .int LABEL(tail_28) - LABEL(tail_table) + .int LABEL(tail_29) - LABEL(tail_table) + .int LABEL(tail_30) - LABEL(tail_table) + .int LABEL(tail_31) - LABEL(tail_table) + + .p2align 4 +LABEL(unaligned_table): + .int LABEL(ashr_0) - LABEL(unaligned_table) + .int LABEL(ashr_1) - LABEL(unaligned_table) + .int LABEL(ashr_2) - LABEL(unaligned_table) + .int LABEL(ashr_3) - LABEL(unaligned_table) + .int LABEL(ashr_4) - LABEL(unaligned_table) + .int LABEL(ashr_5) - LABEL(unaligned_table) + .int LABEL(ashr_6) - LABEL(unaligned_table) + .int LABEL(ashr_7) - LABEL(unaligned_table) + .int LABEL(ashr_8) - LABEL(unaligned_table) + .int LABEL(ashr_9) - LABEL(unaligned_table) + .int LABEL(ashr_10) - LABEL(unaligned_table) + .int LABEL(ashr_11) - LABEL(unaligned_table) + .int LABEL(ashr_12) - LABEL(unaligned_table) + .int LABEL(ashr_13) - LABEL(unaligned_table) + .int LABEL(ashr_14) - LABEL(unaligned_table) + .int LABEL(ashr_15) - LABEL(unaligned_table) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCPY_SSE2, @function; \ + .align 16; \ + STRCPY_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcpy calls through a PLT. + The speedup we get from using SSSE3 instruction is likely eaten away + by the indirect call in the PLT. */ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2 +# undef libc_hidden_def +# define libc_hidden_def(name) \ + .globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2 +#endif + +#ifndef USE_AS_STRNCPY +#include "../strcpy.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c new file mode 100644 index 0000000000..4512267d3f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcspn-c.c @@ -0,0 +1,312 @@ +/* strcspn with SSE4.2 intrinsics + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <nmmintrin.h>
+#include <string.h>
+
+/* We use 0x2:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_ANY
+	| _SIDD_POSITIVE_POLARITY
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to compare xmm/mem128
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   X X X X X X X X X X X X X X X X
+
+   against xmm
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   A A A A A A A A A A A A A A A A
+
+   to find out if the first 16-byte data element has any byte A and
+   the offset of the first byte.  There are 3 cases:
+
+   1. The first 16-byte data element has the byte A at the offset X.
+   2. The first 16-byte data element has EOS and doesn't have the byte A.
+   3. The first 16-byte data element is valid and doesn't have the byte A.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:
+
+   case	ECX	CFlag	ZFlag	SFlag
+    1	 X	  1	 0/1	  0
+    2	16	  0	  1	  0
+    3	16	  0	  0	  0
+
+   We exit from the loop for cases 1 and 2 with jbe which branches
+   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
+   X for case 1.  */
+
+#ifndef STRCSPN_SSE2
+# define STRCSPN_SSE2 __strcspn_sse2
+# define STRCSPN_SSE42 __strcspn_sse42
+#endif
+
+#ifdef USE_AS_STRPBRK
+# define RETURN(val1, val2) return val1
+#else
+# define RETURN(val1, val2) return val2
+#endif
+
+extern
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+STRCSPN_SSE2 (const char *, const char *);
+
+
+#ifdef USE_AS_STRPBRK
+char *
+#else
+size_t
+#endif
+__attribute__ ((section (".text.sse4.2")))
+STRCSPN_SSE42 (const char *s, const char *a)
+{
+  if (*a == 0)
+    RETURN (NULL, strlen (s));
+
+  const char *aligned;
+  __m128i mask;
+  int offset = (int) ((size_t) a & 15);
+  if (offset != 0)
+    {
+      /* Load masks.  */
+      aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L);
+      __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
+
+      switch (offset)
+	{
+	case 1:
+	  mask = _mm_srli_si128 (mask0, 1);
+	  break;
+	case 2:
+	  mask = _mm_srli_si128 (mask0, 2);
+	  break;
+	case 3:
+	  mask = _mm_srli_si128 (mask0, 3);
+	  break;
+	case 4:
+	  mask = _mm_srli_si128 (mask0, 4);
+	  break;
+	case 5:
+	  mask = _mm_srli_si128 (mask0, 5);
+	  break;
+	case 6:
+	  mask = _mm_srli_si128 (mask0, 6);
+	  break;
+	case 7:
+	  mask = _mm_srli_si128 (mask0, 7);
+	  break;
+	case 8:
+	  mask = _mm_srli_si128 (mask0, 8);
+	  break;
+	case 9:
+	  mask = _mm_srli_si128 (mask0, 9);
+	  break;
+	case 10:
+	  mask = _mm_srli_si128 (mask0, 10);
+	  break;
+	case 11:
+	  mask = _mm_srli_si128 (mask0, 11);
+	  break;
+	case 12:
+	  mask = _mm_srli_si128 (mask0, 12);
+	  break;
+	case 13:
+	  mask = _mm_srli_si128 (mask0, 13);
+	  break;
+	case 14:
+	  mask = _mm_srli_si128 (mask0, 14);
+	  break;
+	case 15:
+	  mask = _mm_srli_si128 (mask0, 15);
+	  break;
+	}
+
+      /* Find where the NULL terminator is.  */
+      int length = _mm_cmpistri (mask, mask, 0x3a);
+      if (length == 16 - offset)
+	{
+	  /* There is no NULL terminator.  */
+	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
+	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
+	  length += index;
+
+	  /* Don't use SSE4.2 if the length of A > 16.  */
+	  if (length > 16)
+	    return STRCSPN_SSE2 (s, a);
+
+	  if (index != 0)
+	    {
+	      /* Combine mask0 and mask1.
*/ + switch (offset) + { + case 1: + mask = _mm_alignr_epi8 (mask1, mask0, 1); + break; + case 2: + mask = _mm_alignr_epi8 (mask1, mask0, 2); + break; + case 3: + mask = _mm_alignr_epi8 (mask1, mask0, 3); + break; + case 4: + mask = _mm_alignr_epi8 (mask1, mask0, 4); + break; + case 5: + mask = _mm_alignr_epi8 (mask1, mask0, 5); + break; + case 6: + mask = _mm_alignr_epi8 (mask1, mask0, 6); + break; + case 7: + mask = _mm_alignr_epi8 (mask1, mask0, 7); + break; + case 8: + mask = _mm_alignr_epi8 (mask1, mask0, 8); + break; + case 9: + mask = _mm_alignr_epi8 (mask1, mask0, 9); + break; + case 10: + mask = _mm_alignr_epi8 (mask1, mask0, 10); + break; + case 11: + mask = _mm_alignr_epi8 (mask1, mask0, 11); + break; + case 12: + mask = _mm_alignr_epi8 (mask1, mask0, 12); + break; + case 13: + mask = _mm_alignr_epi8 (mask1, mask0, 13); + break; + case 14: + mask = _mm_alignr_epi8 (mask1, mask0, 14); + break; + case 15: + mask = _mm_alignr_epi8 (mask1, mask0, 15); + break; + } + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return STRCSPN_SSE2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. */ + aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + value = _mm_srli_si128 (value, 1); + break; + case 2: + value = _mm_srli_si128 (value, 2); + break; + case 3: + value = _mm_srli_si128 (value, 3); + break; + case 4: + value = _mm_srli_si128 (value, 4); + break; + case 5: + value = _mm_srli_si128 (value, 5); + break; + case 6: + value = _mm_srli_si128 (value, 6); + break; + case 7: + value = _mm_srli_si128 (value, 7); + break; + case 8: + value = _mm_srli_si128 (value, 8); + break; + case 9: + value = _mm_srli_si128 (value, 9); + break; + case 10: + value = _mm_srli_si128 (value, 10); + break; + case 11: + value = _mm_srli_si128 (value, 11); + break; + case 12: + value = _mm_srli_si128 (value, 12); + break; + case 13: + value = _mm_srli_si128 (value, 13); + break; + case 14: + value = _mm_srli_si128 (value, 14); + break; + case 15: + value = _mm_srli_si128 (value, 15); + break; + } + + int length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. */ + int cflag = _mm_cmpistrc (mask, value, 0x2); + if (cflag) + RETURN ((char *) (s + length), length); + /* Find where the NULL terminator is. */ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x2); + int cflag = _mm_cmpistrc (mask, value, 0x2); + int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) + RETURN (NULL, + /* Find where the NULL terminator is. 
*/ + (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s)); + aligned += 16; + } +} diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S new file mode 100644 index 0000000000..cc75ab70e6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcspn.S @@ -0,0 +1,82 @@ +/* Multiple versions of strcspn + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> + +#ifdef HAVE_SSE4_SUPPORT + +#include <sysdep.h> +#include <ifunc-defines.h> + +#ifdef USE_AS_STRPBRK +#define STRCSPN_SSE42 __strpbrk_sse42 +#define STRCSPN_SSE2 __strpbrk_sse2 +#define __GI_STRCSPN __GI_strpbrk +#else +#ifndef STRCSPN +#define STRCSPN strcspn +#define STRCSPN_SSE42 __strcspn_sse42 +#define STRCSPN_SSE2 __strcspn_sse2 +#define __GI_STRCSPN __GI_strcspn +#endif +#endif + +/* Define multiple versions only for the definition in libc. Don't + define multiple versions for strpbrk in static library since we + need strpbrk before the initialization happened. */ +#if (defined SHARED || !defined USE_AS_STRPBRK) && !defined NOT_IN_libc + .text +ENTRY(STRCSPN) + .type STRCSPN, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRCSPN_SSE2(%rip), %rax + testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 2f + leaq STRCSPN_SSE42(%rip), %rax +2: ret +END(STRCSPN) + +# undef ENTRY +# define ENTRY(name) \ + .type STRCSPN_SSE2, @function; \ + .globl STRCSPN_SSE2; \ + .align 16; \ + STRCSPN_SSE2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2 +# undef libc_hidden_builtin_def +/* It doesn't make sense to send libc-internal strcspn calls through a PLT. + The speedup we get from using SSE4.2 instruction is likely eaten away + by the indirect call in the PLT. 
*/ +# define libc_hidden_builtin_def(name) \ + .globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2 +#endif + +#endif /* HAVE_SSE4_SUPPORT */ + +#ifdef USE_AS_STRPBRK +#include "../strpbrk.S" +#else +#include "../strcspn.S" +#endif diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S index 79e6a977ec..82b03ccc28 100644 --- a/sysdeps/x86_64/multiarch/strlen.S +++ b/sysdeps/x86_64/multiarch/strlen.S @@ -77,6 +77,7 @@ __strlen_sse42: # undef ENTRY # define ENTRY(name) \ .type __strlen_sse2, @function; \ + .align 16; \ __strlen_sse2: cfi_startproc; \ CALL_MCOUNT # undef END diff --git a/sysdeps/x86_64/multiarch/strncpy-c.c b/sysdeps/x86_64/multiarch/strncpy-c.c new file mode 100644 index 0000000000..296c32cb5d --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-c.c @@ -0,0 +1,8 @@ +#define STRNCPY __strncpy_sse2 +#ifdef SHARED +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strncpy_sse2, __GI_strncpy, __strncpy_sse2); +#endif + +#include "strncpy.c" diff --git a/sysdeps/x86_64/multiarch/strncpy.S b/sysdeps/x86_64/multiarch/strncpy.S new file mode 100644 index 0000000000..327a4ce447 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy.S @@ -0,0 +1,3 @@ +#define STRCPY strncpy +#define USE_AS_STRNCPY +#include "strcpy.S" diff --git a/sysdeps/x86_64/multiarch/strpbrk-c.c b/sysdeps/x86_64/multiarch/strpbrk-c.c new file mode 100644 index 0000000000..c58dcb5605 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strpbrk-c.c @@ -0,0 +1,4 @@ +#define USE_AS_STRPBRK +#define STRCSPN_SSE2 __strpbrk_sse2 +#define STRCSPN_SSE42 __strpbrk_sse42 +#include "strcspn-c.c" diff --git a/sysdeps/x86_64/multiarch/strpbrk.S b/sysdeps/x86_64/multiarch/strpbrk.S new file mode 100644 index 0000000000..ed5bca6a94 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strpbrk.S @@ -0,0 +1,3 @@ +#define STRCSPN strpbrk +#define USE_AS_STRPBRK +#include "strcspn.S" diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c new file mode 100644 index 0000000000..5b99f0d383 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn-c.c @@ -0,0 +1,284 @@ +/* strspn with SSE4.2 intrinsics + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <nmmintrin.h> +#include <string.h> + +/* We use 0x12: + _SIDD_SBYTE_OPS + | _SIDD_CMP_EQUAL_ANY + | _SIDD_NEGATIVE_POLARITY + | _SIDD_LEAST_SIGNIFICANT + on pcmpistri to compare xmm/mem128 + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + X X X X X X X X X X X X X X X X + + against xmm + + 0 1 2 3 4 5 6 7 8 9 A B C D E F + A A A A A A A A A A A A A A A A + + to find out if the first 16byte data element has any non-A byte and + the offset of the first byte. There are 2 cases: + + 1. 
The first 16byte data element has the non-A byte, including + EOS, at the offset X. + 2. The first 16byte data element is valid and doesn't have the non-A + byte. + + Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases: + + case ECX CFlag ZFlag SFlag + 1 X 1 0/1 0 + 2 16 0 0 0 + + We exit from the loop for case 1. */ + +extern size_t __strspn_sse2 (const char *, const char *); + + +size_t +__attribute__ ((section (".text.sse4.2"))) +__strspn_sse42 (const char *s, const char *a) +{ + if (*a == 0) + return 0; + + const char *aligned; + __m128i mask; + int offset = (int) ((size_t) a & 15); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & 0xfffffffffffffff0L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + mask = _mm_srli_si128 (mask0, 1); + break; + case 2: + mask = _mm_srli_si128 (mask0, 2); + break; + case 3: + mask = _mm_srli_si128 (mask0, 3); + break; + case 4: + mask = _mm_srli_si128 (mask0, 4); + break; + case 5: + mask = _mm_srli_si128 (mask0, 5); + break; + case 6: + mask = _mm_srli_si128 (mask0, 6); + break; + case 7: + mask = _mm_srli_si128 (mask0, 7); + break; + case 8: + mask = _mm_srli_si128 (mask0, 8); + break; + case 9: + mask = _mm_srli_si128 (mask0, 9); + break; + case 10: + mask = _mm_srli_si128 (mask0, 10); + break; + case 11: + mask = _mm_srli_si128 (mask0, 11); + break; + case 12: + mask = _mm_srli_si128 (mask0, 12); + break; + case 13: + mask = _mm_srli_si128 (mask0, 13); + break; + case 14: + mask = _mm_srli_si128 (mask0, 14); + break; + case 15: + mask = _mm_srli_si128 (mask0, 15); + break; + } + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16 - offset) + { + /* There is no NULL terminator. */ + __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); + int index = _mm_cmpistri (mask1, mask1, 0x3a); + length += index; + + /* Don't use SSE4.2 if the length of A > 16. */ + if (length > 16) + return __strspn_sse2 (s, a); + + if (index != 0) + { + /* Combine mask0 and mask1. */ + switch (offset) + { + case 1: + mask = _mm_alignr_epi8 (mask1, mask0, 1); + break; + case 2: + mask = _mm_alignr_epi8 (mask1, mask0, 2); + break; + case 3: + mask = _mm_alignr_epi8 (mask1, mask0, 3); + break; + case 4: + mask = _mm_alignr_epi8 (mask1, mask0, 4); + break; + case 5: + mask = _mm_alignr_epi8 (mask1, mask0, 5); + break; + case 6: + mask = _mm_alignr_epi8 (mask1, mask0, 6); + break; + case 7: + mask = _mm_alignr_epi8 (mask1, mask0, 7); + break; + case 8: + mask = _mm_alignr_epi8 (mask1, mask0, 8); + break; + case 9: + mask = _mm_alignr_epi8 (mask1, mask0, 9); + break; + case 10: + mask = _mm_alignr_epi8 (mask1, mask0, 10); + break; + case 11: + mask = _mm_alignr_epi8 (mask1, mask0, 11); + break; + case 12: + mask = _mm_alignr_epi8 (mask1, mask0, 12); + break; + case 13: + mask = _mm_alignr_epi8 (mask1, mask0, 13); + break; + case 14: + mask = _mm_alignr_epi8 (mask1, mask0, 14); + break; + case 15: + mask = _mm_alignr_epi8 (mask1, mask0, 15); + break; + } + } + } + } + else + { + /* A is aligned. */ + mask = _mm_load_si128 ((__m128i *) a); + + /* Find where the NULL terminator is. */ + int length = _mm_cmpistri (mask, mask, 0x3a); + if (length == 16) + { + /* There is no NULL terminator. Don't use SSE4.2 if the length + of A > 16. */ + if (a[16] != 0) + return __strspn_sse2 (s, a); + } + } + + offset = (int) ((size_t) s & 15); + if (offset != 0) + { + /* Check partial string. 
*/ + aligned = (const char *) ((size_t) s & 0xfffffffffffffff0L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + switch (offset) + { + case 1: + value = _mm_srli_si128 (value, 1); + break; + case 2: + value = _mm_srli_si128 (value, 2); + break; + case 3: + value = _mm_srli_si128 (value, 3); + break; + case 4: + value = _mm_srli_si128 (value, 4); + break; + case 5: + value = _mm_srli_si128 (value, 5); + break; + case 6: + value = _mm_srli_si128 (value, 6); + break; + case 7: + value = _mm_srli_si128 (value, 7); + break; + case 8: + value = _mm_srli_si128 (value, 8); + break; + case 9: + value = _mm_srli_si128 (value, 9); + break; + case 10: + value = _mm_srli_si128 (value, 10); + break; + case 11: + value = _mm_srli_si128 (value, 11); + break; + case 12: + value = _mm_srli_si128 (value, 12); + break; + case 13: + value = _mm_srli_si128 (value, 13); + break; + case 14: + value = _mm_srli_si128 (value, 14); + break; + case 15: + value = _mm_srli_si128 (value, 15); + break; + } + + int length = _mm_cmpistri (mask, value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. */ + int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + return length; + aligned += 16; + } + else + aligned = s; + + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); + int index = _mm_cmpistri (mask, value, 0x12); + int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; + } +} diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S new file mode 100644 index 0000000000..4183a2cf60 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strspn.S @@ -0,0 +1,63 @@ +/* Multiple versions of strspn + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <config.h> + +#ifdef HAVE_SSE4_SUPPORT + +#include <sysdep.h> +#include <ifunc-defines.h> + +/* Define multiple versions only for the definition in libc. 
 */
+#ifndef NOT_IN_libc
+	.text
+ENTRY(strspn)
+	.type	strspn, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__strspn_sse2(%rip), %rax
+	testl	$(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+	jz	2f
+	leaq	__strspn_sse42(%rip), %rax
+2:	ret
+END(strspn)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strspn_sse2, @function; \
+	.globl __strspn_sse2; \
+	.align 16; \
+	__strspn_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strspn_sse2, .-__strspn_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strspn calls through a PLT.
+   The speedup we get from using SSE4.2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strspn; __GI_strspn = __strspn_sse2
+#endif
+
+#endif /* HAVE_SSE4_SUPPORT */
+
+#include "../strspn.S"
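
Some notes on the recurring techniques in this patch, each with a small C sketch. Every identifier introduced below (my_strcspn, load_shifted_window, and so on) is illustrative, not part of the patch.

The IFUNC stubs (ENTRY(STRCPY), ENTRY(STRCSPN), ENTRY(strspn)) all follow one pattern: a symbol of type @gnu_indirect_function whose body runs once at symbol-resolution time, tests a CPUID feature bit cached in __cpu_features (bit 9 of CPUID.1:ECX for SSSE3, bit 20 for SSE4.2), and returns the address the GOT entry should hold from then on. The stubs cannot call ordinary libc facilities that early, hence the explicit __init_cpu_features call guarded by the KIND_OFFSET check. A minimal sketch of the same dispatch idea, assuming a modern GCC with the ifunc attribute and CPU builtins instead of glibc's internal tables:

#include <stddef.h>
#include <string.h>

/* Stand-in implementations; the real ones would be the SSE2 and
   SSE4.2 bodies.  */
static size_t
my_strcspn_sse2 (const char *s, const char *reject)
{
  return strcspn (s, reject);
}

static size_t
my_strcspn_sse42 (const char *s, const char *reject)
{
  return strcspn (s, reject);
}

/* The resolver runs once, before the first call; its result is
   written into the GOT, so steady-state calls pay no dispatch
   cost -- exactly what the @gnu_indirect_function stubs do.  */
static void *
resolve_my_strcspn (void)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("sse4.2")
	 ? (void *) my_strcspn_sse42
	 : (void *) my_strcspn_sse2;
}

size_t my_strcspn (const char *, const char *)
     __attribute__ ((ifunc ("resolve_my_strcspn")));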
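
The core of the new strcpy.S is the ashr_1 through ashr_15 ladder. PALIGNR (like PSRLDQ, behind the 15-way switches in strcspn-c.c and strspn-c.c) takes its shift count as an immediate, so the code cannot loop over a variable misalignment; it emits one loop per shift count and enters the right one through unaligned_table. Each iteration first scans the upper aligned block for a NUL, then merges two aligned 16-byte loads into the 16 source bytes the destination needs. A sketch of the data movement in one ashr_5 iteration, assuming base is 16-byte aligned and the NUL check has already passed (compile with -mssse3):

#include <tmmintrin.h>	/* SSSE3: _mm_alignr_epi8 */
#include <stdint.h>

/* C analogue of "movdqa 16(%rsi,%rcx), %xmm3;
   palignr $5, (%rsi,%rcx), %xmm3": reproduce the 16 bytes at
   base + 5 using only aligned loads.  */
static __m128i
load_shifted_window (const uint8_t *base)
{
  __m128i lo = _mm_load_si128 ((const __m128i *) base);
  __m128i hi = _mm_load_si128 ((const __m128i *) (base + 16));
  /* Concatenate hi:lo and shift right by 5 bytes, keeping the low
     128 bits: bytes base[5] .. base[20].  */
  return _mm_alignr_epi8 (hi, lo, 5);
}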
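
Both tail_table and unaligned_table store 32-bit offsets relative to the table's own address (.int LABEL(tail_0) - LABEL(tail_table) and so on) rather than absolute pointers, so the tables can live in read-only data with no load-time relocations in the shared object. The dispatch sequence movslq (%r11,%rcx,4), %rcx; lea (%r11,%rcx), %rcx; jmp *%rcx computes, in C terms (table_target is an illustrative helper):

#include <stdint.h>

/* TABLE plays the role of %r11 (the table's own address), IDX the
   case number: sign-extend the 32-bit entry and add it back to the
   table address to recover the absolute jump target.  */
static const void *
table_target (const int32_t *table, unsigned int idx)
{
  return (const char *) table + table[idx];
}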
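
strncpy_fill_tail implements the strncpy padding rule: when the source string ends before n bytes have been written, the rest of the destination must be filled with NUL bytes. The tail does the bulk with rep stosq and finishes the remainder one byte at a time; semantically it is just (pad_tail is an illustrative name):

#include <stddef.h>
#include <string.h>

/* After COPIED bytes (including the terminating NUL) have been
   stored into DST, zero the remainder of the N-byte buffer, as
   strncpy requires.  */
static void
pad_tail (char *dst, size_t copied, size_t n)
{
  if (copied < n)
    memset (dst + copied, '\0', n - copied);
}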
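
Each tail_N block ends, in the stpncpy configuration, with the pair cmpb $1, (%rax); sbb $-1, %rax. The cmp sets the carry flag exactly when the byte is 0 (unsigned 0 < 1), and sbb $-1 computes rax + 1 - CF, so the pointer advances past the byte only when it is not a NUL. That is the branchless form of stpncpy's return rule: a pointer to the first NUL stored, or dest + n if the copy was truncated. In C (fixup_stpncpy_return is an illustrative name):

/* END points at the last byte the tail examined; keep it there if
   that byte is the terminating NUL, step past it otherwise.  */
static char *
fixup_stpncpy_return (char *end)
{
  return end + (*end != '\0');
}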
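
Finally, strcspn-c.c and strspn-c.c revolve around three PCMPISTRI modes built on the EQUAL_ANY aggregation: 0x02 (positive polarity) yields the index of the first string byte that is in the set, 0x12 (negative polarity) the first byte that is not, and 0x3a -- a register compared against itself under masked negative polarity -- the index of the first invalid byte, i.e. the NUL position. Because the instruction implicitly stops each operand at its first NUL, a set of up to 16 bytes fits in one register. A condensed sketch of the strcspn loop; it uses unaligned loads and so omits the real code's aligned head handling, which exists so that no 16-byte read ever strays onto an unmapped page (compile with -msse4.2):

#include <nmmintrin.h>	/* SSE4.2: _mm_cmpistr* */
#include <stddef.h>

/* Assumes strlen (reject) <= 16 and that 16-byte reads from S are
   safe; the real code verifies both conditions and otherwise falls
   back to __strcspn_sse2.  */
static size_t
strcspn_sse42_sketch (const char *s, const char *reject)
{
  const __m128i set = _mm_loadu_si128 ((const __m128i *) reject);
  for (const char *p = s; ; p += 16)
    {
      __m128i chunk = _mm_loadu_si128 ((const __m128i *) p);
      /* CFlag: some byte of CHUNK before its NUL is in SET.  */
      if (_mm_cmpistrc (set, chunk, 0x02))
	return (size_t) (p - s) + _mm_cmpistri (set, chunk, 0x02);
      /* ZFlag: CHUNK contains the NUL and no set byte precedes it;
	 the 0x3a self-comparison locates that NUL.  */
      if (_mm_cmpistrz (set, chunk, 0x02))
	return (size_t) (p - s) + _mm_cmpistri (chunk, chunk, 0x3a);
    }
}

strspn's loop is shorter still: with negative polarity (0x12) the NUL itself reports as "not in the set", so testing the carry flag alone terminates the scan.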