92 files changed, 4418 insertions, 1581 deletions
diff --git a/ChangeLog b/ChangeLog index 4b5cd019e1..0cb533b2c8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -229,6 +229,384 @@ bits/wordsize.h. (sigaction): Declare __glibc_reserved0 only when __WORDSIZE is 64. +2015-09-01 Paul E. Murphy <murphyp@linux.vnet.ibm.com> + + * sysdeps/powerpc/powerpc32/sysdep.h (ABORT_TRANSACTION): Use a + register other than r0 for tabort, as it has special meaning. + * sysdeps/powerpc/powerpc64/sysdep.h (ABORT_TRANSACTION): Likewise. + * sysdeps/unix/sysv/linux/powerpc/syscall.S (syscall): Abort + transaction before starting syscall. + +2015-09-01 Paul E. Murphy <murphyp@linux.vnet.ibm.com> + + * sysdeps/unix/sysv/linux/powerpc/elision-lock.c + (__arch_compare_and_exchange_val_32_acq): Remove and use common + definition. ISA 2.07B no longer requires full sync. + +2015-05-11 Andreas Schwab <schwab@suse.de> + + [BZ #18007] + * nis/nss_compat/compat-grp.c (internal_endgrent): Don't call + nss_endgrent. + (_nss_compat_endgrent): Call nss_endgrent. + * nis/nss_compat/compat-pwd.c (internal_endpwent): Don't call + nss_endpwent. + (_nss_compat_endpwent): Call nss_endpwent. + * nis/nss_compat/compat-spwd.c (internal_setspent): Add parameter + needent, call nss_setspent only if non-zero. + (_nss_compat_setspent, _nss_compat_getspent_r): Pass non-zero. + (internal_endspent): Don't call nss_endspent. + (_nss_compat_endspent): Call nss_endspent. + * nss/nss_files/files-XXX.c (position, last_use, keep_stream): + Remove. All uses removed. + (internal_setent): Remove parameter stayopen, add parameter + stream. Use it instead of global variable. + (CONCAT(_nss_files_set,ENTNAME)): Pass global stream. + (internal_endent, internal_getent): Add parameter stream. Use it + instead of global variable. + (CONCAT(_nss_files_end,ENTNAME)) + (CONCAT(_nss_files_get,ENTNAME_r)): Pass global stream. + (_nss_files_get##name##_r): Pass local stream. Remove locking. + * nss/nss_files/files-alias.c (position, last_use): Remove. All + uses removed. + (internal_setent, internal_endent): Add parameter stream. Use it + instead of global variable. + (_nss_files_setaliasent, _nss_files_endaliasent): Pass global + stream. + (get_next_alias): Add parameter stream. + (_nss_files_getaliasent_r): Pass global stream. + (_nss_files_getaliasbyname_r): Pass local stream. Remove locking. + * nss/nss_files/files-hosts.c (_nss_files_gethostbyname3_r) + (_nss_files_gethostbyname4_r): Pass local stream to + internal_setent, internal_getent and internal_endent. Remove + locking. + +2015-04-29 Florian Weimer <fweimer@redhat.com> + + [BZ #18007] + * nss/nss_files/files-XXX.c (CONCAT): Always enable stayopen. + (CVE-2014-8121) + * nss/tst-nss-getpwent.c: New file. + * nss/Makefile (tests): Add new test. + +2015-04-21 Arjun Shankar <arjun.is@lostca.se> + + [BZ #18287] + * resolv/nss_dns/dns-host.c (getanswer_r): Adjust buffer length + based on padding. (CVE-2015-1781) + +2015-03-10 Adhemerval Zanella <azanella@linux.vnet.ibm.com> + + * sysdeps/ieee754/dbl-64/Makefile (CFLAGS-e_pow.c): Add + $(config-cflags-nofma). + +2015-02-23 Paul Pluzhnikov <ppluzhnikov@google.com> + + [BZ #16618] + * stdio-common/tst-sscanf.c (main): Test for buffer overflow. + * stdio-common/vfscanf.c (_IO_vfscanf_internal): Compute needed + size in bytes. Store needed elements in wpmax. Use needed size + in bytes for extend_alloca. + +2015-02-12 Adhemerval Zanella <azanella@linux.vnet.ibm.com> + + * sysdeps/unix/sysv/linux/powerpc/htm.h [TABORT]: Fix encoding for + little endian.
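The BZ #18287 entry above comes down to a single invariant: when a caller-supplied buffer is advanced to satisfy alignment, the usable length must shrink by the same number of bytes. Here is a minimal C sketch of that rule, with a stand-in structure in place of resolv's internal struct host_data:

#include <stddef.h>
#include <stdint.h>

/* Stand-in for resolv's struct host_data; only its alignment matters.  */
struct host_data { char *aliases[1]; };

/* Advance *buffer to the next __alignof__ (struct host_data) boundary
   and return how much space is really left.  Keeping the old length
   (the pre-fix behavior) overstates the buffer by up to alignment - 1
   bytes, which is what made CVE-2015-1781 exploitable.  */
static size_t
align_host_buffer (char **buffer, size_t buflen)
{
  uintptr_t pad = -(uintptr_t) *buffer % __alignof__ (struct host_data);
  *buffer += pad;
  return buflen > pad ? buflen - pad : 0;
}

This mirrors the dns-host.c and files-hosts.c hunks later in this diff, which apply exactly this pad/clamp pair.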
+ +2015-01-20 Adhemerval Zanella <azanella@linux.vnet.ibm.com> + + * sysdeps/powerpc/nptl/tls.h (tcbhead_t): Add tm_capable field. + (TLS_INIT_TP): Add tm_capable initialization. + (TLS_DEFINE_INIT_TP): Likewise. + (THREAD_GET_TM_CAPABLE): New macro: get tm_capable field value from + TCB. + (THREAD_SET_TM_CAPABLE): New macro: set tm_capable field value in TCB. + * sysdeps/powerpc/nptl/tcb-offsets.sym (TM_CAPABLE): Add field offset + calculation. + * sysdeps/powerpc/powerpc32/sysdep.h (DO_CALL): Abort hardware + transaction if lock elision is built and TCB tm_capable is set. + * sysdeps/powerpc/powerpc64/sysdep.h (DO_CALL): Likewise. + * sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h + (INTERNAL_SYSCALL_NCS): Likewise. + * sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h + (INTERNAL_SYSCALL_NCS): Likewise. + * sysdeps/powerpc/sysdep.h (ABORT_TRANSACTION): New define. + + * sysdeps/powerpc/nptl/elide.h: New file: generic lock elision support + for powerpc. + * sysdeps/unix/sysv/linux/powerpc/bits/pthreadtypes.h + [pthread_rwlock_t] (__pad1): Change size to 7 bytes in 64 bits case + and remove it for 32 bits case. + [pthread_rwlock_t] (__rwelision): New field for lock elision. + (__PTHREAD_RWLOCK_ELISION_EXTRA): Adjust for new lock elision field + initialization. + * sysdeps/unix/sysv/linux/powerpc/elision-conf.c (elision_init): + Disable lock elision with rdlocks if elision is not available. + + * sysdeps/unix/sysv/linux/powerpc/Makefile [nptl] + (sysdep_routines): Add lock elision objects. + * sysdeps/unix/sysv/linux/powerpc/bits/pthreadtypes.h + [pthread_mutex_t] (__spins): Rework to add lock elision field. + [pthread_mutex_t] (__elision): Add field. + [__PTHREAD_SPINS]: Adjust to init lock elision field. + * sysdeps/unix/sysv/linux/powerpc/elision-conf.c: New file: lock + elision definitions for powerpc. + * sysdeps/unix/sysv/linux/powerpc/elision-lock.c: New file: + implementation of lock elision for powerpc. + * sysdeps/unix/sysv/linux/powerpc/elision-timed.c: New file: + implementation of timed lock elision for powerpc. + * sysdeps/unix/sysv/linux/powerpc/elision-trylock.c: New file: + implementation of trylock with lock elision for powerpc. + * sysdeps/unix/sysv/linux/powerpc/elision-unlock.c: New file: + implementation of unlock for lock elision for powerpc. + * sysdeps/unix/sysv/linux/powerpc/force-elision.h: New file: + automatically enable lock elision for mutexes. + * sysdeps/unix/sysv/linux/powerpc/htm.h: New file: hardware + transaction execution definitions for powerpc. + * sysdeps/unix/sysv/linux/powerpc/lowlevellock.h: New file: add TLE + definitions. + * sysdeps/unix/sysv/linux/powerpc/pthread_mutex_cond_lock.c: New file. + * sysdeps/unix/sysv/linux/powerpc/pthread_mutex_lock.c: Likewise. + * sysdeps/unix/sysv/linux/powerpc/pthread_mutex_timedlock.c: Likewise. + * sysdeps/unix/sysv/linux/powerpc/pthread_mutex_trylock.c: Likewise. + * NEWS: Update. + +2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com> + + * sysdeps/powerpc/powerpc64/power7/memcmp.S (memcmp): Fix performance + regression on LE. + + * sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S: New file. + * sysdeps/powerpc/powerpc64/power8/strncmp.S: New file. + * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add + strncmp-power8 object. + * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Add __strncmp_power8 implementation. + * sysdeps/powerpc/powerpc64/multiarch/strncmp.c (strncmp): Likewise. + * NEWS: Update.
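The elide.h added by these entries (its full text appears in a hunk further down) implements adaptive elision: run the critical section as a hardware transaction when possible, and back off to the real lock for a while after persistent aborts. Below is a condensed sketch of that pattern, assuming GCC's -mhtm builtins; the ABORT_LOCK_BUSY value and the back-off constant are illustrative stand-ins for the htm.h and elision-conf.c settings:

#include <stdbool.h>
#include <stdint.h>
#include <htmintrin.h>          /* GCC powerpc HTM intrinsics (-mhtm).  */

#define ABORT_LOCK_BUSY 0x34    /* Illustrative abort code.  */

static inline bool
elide_lock (uint8_t *adapt_count, int is_lock_free, int try_tbegin)
{
  if (*adapt_count > 0)
    {
      /* Recent aborts: skip elision and take the real lock for now.  */
      (*adapt_count)--;
      return false;
    }
  for (int i = try_tbegin; i > 0; i--)
    {
      if (__builtin_tbegin (0))
        {
          if (is_lock_free)
            return true;        /* Critical section runs transactionally.  */
          /* Lock already held: abort rather than race with the owner.  */
          __builtin_tabort (ABORT_LOCK_BUSY);
        }
      else if (_TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ()))
        {
          /* Retrying is unlikely to help; use normal locking for the
             next few acquisitions.  */
          *adapt_count = 3;
          break;
        }
    }
  return false;
}

The matching ABORT_TRANSACTION hook in DO_CALL exists because a syscall's side effects cannot be rolled back: aborting before the sc instruction keeps them from leaking out of a transaction, as the NEWS entry below notes.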
+ +2015-01-13 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> + Adhemerval Zanella <azanella@linux.vnet.ibm.com> + + * sysdeps/powerpc/powerpc64/power7/strcmp.S (strcmp): Optimize + trailing byte check. + +2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com> + + * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: + Add strcmp-power8 object. + * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Add __strcmp_power8 implementation. + * sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S: New file. + * sysdeps/powerpc/powerpc64/multiarch/strcmp.c (strcmp): Add + __strcmp_power8 implementation. + * sysdeps/powerpc/powerpc64/power8/strcmp.S: New file. + * NEWS: Update. + + * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: + Add strncpy-power8 and stpncpy-power8 objects. + * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Add __strncpy_power8 and __stpncpy_power8 + implementations. + * sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S: New file. + * sysdeps/powerpc/powerpc64/multiarch/stpncpy.c (__stpncpy): Add + __stpncpy_power8 implementation. + * sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S: New file. + * sysdeps/powerpc/powerpc64/multiarch/strncpy.c (strncpy): Add + __strncpy_power8 implementation. + * sysdeps/powerpc/powerpc64/power8/stpncpy.S: New file. + * sysdeps/powerpc/powerpc64/power8/strncpy.S: New file. + * NEWS: Update. + + * sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c: New file. + * sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: Remove file. + * sysdeps/powerpc/powerpc64/power7/strncat.S: Likewise. + + * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add + strcat-power8 object. + * sysdeps/powerpc/powerpc64/multiarch/strcat.c (strcat): Add + __strcat_power8 implementation. + * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Add __strcat_power8 implementation. + * sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c: New file: + optimized strcat for power8. + + * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add + strcpy-power8 and stpcpy-power8 objects. + * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Add __strcpy_power8 and __stpcpy_power8 + implementations. + * sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S: New file: + multiarch stpcpy implementation for POWER8. + * sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S: New file: + multiarch strcpy implementation for POWER8. + * sysdeps/powerpc/powerpc64/multiarch/strcpy.c (strcpy): Add + __strcpy_power8 function. + * sysdeps/powerpc/powerpc64/power8/stpcpy.S: New file: optimized + stpcpy for POWER8. + * sysdeps/powerpc/powerpc64/power8/strcpy.S: New file: optimized + strcpy for POWER8. + * NEWS: Update. + +2014-12-31 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> + Adhemerval Zanella <azanella@linux.vnet.ibm.com> + + * sysdeps/powerpc/powerpc64/power7/strcpy.S (strcpy): Optimize unaligned + path. + * benchtests/bench-strcpy.c (test_main): Add more unaligned inputs. + +2014-12-16 Florian Weimer <fweimer@redhat.com> + + [BZ #17630] + * resolv/nss_dns/dns-network.c (getanswer_r): Iterate over alias + names. + +2014-12-15 Jeff Law <law@redhat.com> + + [BZ #16617] + * stdio-common/vfprintf.c (vfprintf): Allocate large specs array + on the heap. (CVE-2012-3406) + * stdio-common/bug23-2.c, stdio-common/bug23-3.c: New file. + * stdio-common/bug23-4.c: New file.
Test case by Joseph Myers. + * stdio-common/Makefile (tests): Add bug23-2, bug23-3, bug23-4. + +2014-12-02 Adhemerval Zanella <azanella@linux.vnet.ibm.com> + + * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: + Remove strpbrk objects. + * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Remove strpbrk implementation. + * sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c: Remove file. + * sysdeps/powerpc/powerpc64/multiarch/strpbrk.c: Remove file. + * sysdeps/powerpc/powerpc64/power7/strpbrk.S: Remove file. + * sysdeps/powerpc/powerpc64/strpbrk.S: New file. + + * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: + Remove strcspn objects. + * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Remove strcspn implementation. + * sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c: Remove file. + * sysdeps/powerpc/powerpc64/multiarch/strcspn.c: Remove file. + * sysdeps/powerpc/powerpc64/power7/strcspn.S: Remove file. + * sysdeps/powerpc/powerpc64/strcspn.S: New file. + + * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: + Remove strspn objects. + * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Remove strspn implementation. + * sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S: Remove file. + * sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c: Likewise. + * sysdeps/powerpc/powerpc64/power7/strspn.S: Remove file. + * sysdeps/powerpc/powerpc64/strspn.S: New file. + +2014-12-01 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com> + + * sysdeps/powerpc/powerpc64/strtok.S: New file. + * sysdeps/powerpc/powerpc64/strtok_r.S: New file. + +2014-11-26 Adhemerval Zanella <azanella@linux.vnet.ibm.com> + + * csu/tst-atomic.c (do_test): Add atomic_exchange_and_add_{acq,rel} + tests. + * sysdeps/powerpc/bits/atomic.h + (__arch_atomic_exchange_and_add_32_acq): Add definition. + (__arch_atomic_exchange_and_add_32_rel): Likewise. + (atomic_exchange_and_add_acq): Likewise. + (atomic_exchange_and_add_rel): Likewise. + * sysdeps/powerpc/powerpc32/bits/atomic.h + (__arch_atomic_exchange_and_add_64_acq): Add definition. + (__arch_atomic_exchange_and_add_64_rel): Likewise. + * sysdeps/powerpc/powerpc64/bits/atomic.h + (__arch_atomic_exchange_and_add_64_acq): Add definition. + (__arch_atomic_exchange_and_add_64_rel): Likewise. + +2014-11-25 Anton Blanchard <anton@samba.org> + + * sysdeps/powerpc/bits/atomic.h + (__arch_compare_and_exchange_bool_64_rel): Load from mem. + +2014-11-19 Carlos O'Donell <carlos@redhat.com> + Florian Weimer <fweimer@redhat.com> + Joseph Myers <joseph@codesourcery.com> + Adam Conrad <adconrad@0c3.net> + Andreas Schwab <schwab@suse.de> + Brooks Moses <bmoses@google.com> + + [BZ #17625] + * posix/wordexp-test.c (__dso_handle): Add prototype. + (__register_atfork): Likewise. + (__app_register_atfork): New function. + (registered_forks): New global. + (register_fork): New function. + (test_case): Add 3 new tests for WRDE_CMDSUB. + (main): Call __app_register_atfork. + (testit): If WRDE_NOCMD is set, set registered_forks to zero, run the + test, and fail if the fork count is non-zero. + * posix/wordexp.c (exec_comm): Return WRDE_CMDSUB if WRDE_NOCMD flag + is set. + (parse_dollars): Remove check for WRDE_NOCMD. + (parse_dquote): Likewise. + +2014-11-05 Adhemerval Zanella <azanella@linux.vnet.ibm.com> + + * sysdeps/powerpc/powerpc64/power8/memset.S (MTVSRD_V1_R4): Simplify + definition. + * sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S (MFVSRD_R3_V1): + Likewise.
+ * sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S (MFVSRD_R3_V1): + Likewise. + * sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S (MFVSRD_R3_V1): + Likewise. + * sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S (MFVSRD_R3_V1): + Likewise. + * sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S (MFVSRD_R3_V1): + Likewise. + +2014-11-03 Adhemerval Zanella <azanella@linux.vnet.ibm.com> + + * sysdeps/powerpc/powerpc64/power8/memset.S (MTVSRD_V1_R4): Encode + mtvsrd instruction in binary form. + +2014-10-31 Torvald Riegel <triegel@redhat.com> + + * sysdeps/powerpc/bits/atomic.h (atomic_write_barrier): Remove and... + * sysdeps/powerpc/powerpc32/bits/atomic.h (atomic_write_barrier): + ... add here and use lwsync or sync ... + * sysdeps/powerpc/powerpc64/bits/atomic.h (atomic_write_barrier): + ... and add here using lwsync. + +2014-09-10 Adhemerval Zanella <azanella@linux.vnet.ibm.com> + + * benchtests/bench-memset.c (test_main): Add more tests for sizes + from 32 to 512 bytes. + * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: + Add POWER8 memset object. + * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Add POWER8 memset and bzero implementations. + * sysdeps/powerpc/powerpc64/multiarch/bzero.c (__bzero): Add POWER8 + implementation. + * sysdeps/powerpc/powerpc64/multiarch/memset.c (__libc_memset): + Likewise. + * sysdeps/powerpc/powerpc64/multiarch/memset-power8.S: New file: + multiarch POWER8 memset optimization. + * sysdeps/powerpc/powerpc64/power8/memset.S: New file: optimized + POWER8 memset optimization. + + * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: + Remove bzero multiarch objects. + * sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S: Remove file. + * sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S: Likewise. + * sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S: Likewise. + * sysdeps/powerpc/powerpc64/multiarch/memset-power4.S [NO_BZERO_IMPL]: + Remove define. + [__bzero]: Redefine to specific name. + * sysdeps/powerpc/powerpc64/multiarch/memset-power6.S: Likewise. + * sysdeps/powerpc/powerpc64/multiarch/memset-power7.S: Likewise. + * sysdeps/powerpc/powerpc64/power4/memset.S [NO_BZERO_IMPL]: Remove + define. + * sysdeps/powerpc/powerpc64/power6/memset.S: Likewise. + * sysdeps/powerpc/powerpc64/power7/memset.S: Likewise. + 2015-02-16 Paul Pluzhnikov <ppluzhnikov@google.com> [BZ #16618] diff --git a/NEWS b/NEWS index dc8679f8ca..6181729b61 100644 --- a/NEWS +++ b/NEWS @@ -10,8 +10,8 @@ Version 2.20.1 * The following bugs are resolved with this release: 16009, 16617, 16618, 17266, 17269, 17370, 17371, 17460, 17485, 17555, - 17625, 17630, 17801, 17905, 18032, 18080, 18240, 18508, 18665, 18694, - 18928, 19018, 19682. + 17625, 17630, 17801, 17905, 18007, 18032, 18080, 18240, 18287, 18508, + 18665, 18694, 18928, 19018, 19682. * The glob function suffered from a stack-based buffer overflow when it was called with the GLOB_ALTDIRFUNC flag and encountered a long file name. @@ -43,6 +43,28 @@ Version 2.20.1 * The LD_POINTER_GUARD environment variable can no longer be used to disable the pointer guard feature. It is always enabled. +* A buffer overflow in gethostbyname_r and related functions performing DNS + requests has been fixed. If the NSS functions were called with a + misaligned buffer, the buffer length change due to pointer alignment was + not taken into account. This could result in application crashes or, + potentially, arbitrary code execution using crafted, but syntactically + valid, DNS responses.
(CVE-2015-1781) + +* CVE-2014-8121 The NSS backends shared internal state between the getXXent + and getXXbyYY NSS calls for the same database, causing a denial-of-service + condition in some applications. + +* Added support for TSX lock elision of pthread mutexes on powerpc32, powerpc64 + and powerpc64le. This may improve lock scaling of existing programs on + HTM capable systems. The lock elision code is only enabled with + --enable-lock-elision=yes. Also, the TSX lock elision implementation for + powerpc will issue a transaction abort on every syscall to avoid side + effects being visible outside transactions. + +* Optimized strcpy, stpcpy, strncpy, stpncpy, strcmp, and strncmp + implementations for powerpc64/powerpc64le. + Implemented by Adhemerval Zanella (IBM). + * CVE-2015-1472 Under certain conditions wscanf can allocate too little memory for the to-be-scanned arguments and overflow the allocated buffer. The implementation now correctly computes the required buffer diff --git a/benchtests/bench-memset.c b/benchtests/bench-memset.c index 5304113e3d..20265936b9 100644 --- a/benchtests/bench-memset.c +++ b/benchtests/bench-memset.c @@ -150,6 +150,11 @@ test_main (void) if (i & (i - 1)) do_test (0, c, i); } + for (i = 32; i < 512; i+=32) + { + do_test (0, c, i); + do_test (i, c, i); + } do_test (1, c, 14); do_test (3, c, 1024); do_test (4, c, 64); diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c index c3ab4cfcf7..e9445f290f 100644 --- a/benchtests/bench-strcpy.c +++ b/benchtests/bench-strcpy.c @@ -171,6 +171,22 @@ test_main (void) do_test (i, i, 8 << i, BIG_CHAR); } + for (i = 16; i <= 512; i+=4) + { + do_test (0, 4, i, SMALL_CHAR); + do_test (4, 0, i, BIG_CHAR); + do_test (4, 4, i, SMALL_CHAR); + do_test (2, 2, i, BIG_CHAR); + do_test (2, 6, i, SMALL_CHAR); + do_test (6, 2, i, BIG_CHAR); + do_test (1, 7, i, SMALL_CHAR); + do_test (7, 1, i, BIG_CHAR); + do_test (3, 4, i, SMALL_CHAR); + do_test (4, 3, i, BIG_CHAR); + do_test (5, 7, i, SMALL_CHAR); + do_test (7, 5, i, SMALL_CHAR); + } + return ret; } diff --git a/csu/tst-atomic.c b/csu/tst-atomic.c index d16c66dc31..ab6db45307 100644 --- a/csu/tst-atomic.c +++ b/csu/tst-atomic.c @@ -113,6 +113,22 @@ do_test (void) ret = 1; } + mem = 2; + if (atomic_exchange_and_add_acq (&mem, 11) != 2 + || mem != 13) + { + puts ("atomic_exchange_and_add test failed"); + ret = 1; + } + + mem = 2; + if (atomic_exchange_and_add_rel (&mem, 11) != 2 + || mem != 13) + { + puts ("atomic_exchange_and_add test failed"); + ret = 1; + } + mem = -21; atomic_add (&mem, 22); if (mem != 1) diff --git a/elf/get-dynamic-info.h b/elf/get-dynamic-info.h index 20ccf30b2a..7f51d90dbc 100644 --- a/elf/get-dynamic-info.h +++ b/elf/get-dynamic-info.h @@ -130,8 +130,8 @@ elf_get_dynamic_info (struct link_map *l, ElfW(Dyn) *temp) assert (info[DT_FLAGS] == NULL || (info[DT_FLAGS]->d_un.d_val & ~DF_BIND_NOW) == 0); /* Flags must not be set for ld.so. 
*/ - assert (info[DT_RUNPATH] == NULL); - assert (info[DT_RPATH] == NULL); + info[DT_RUNPATH] == NULL; + info[DT_RPATH] == NULL; #else if (info[DT_FLAGS] != NULL) { diff --git a/localedata/locales/bo_CN b/localedata/locales/bo_CN index d813c103ae..c573d3fe42 100644 --- a/localedata/locales/bo_CN +++ b/localedata/locales/bo_CN @@ -145,8 +145,7 @@ END LC_MEASUREMENT LC_NAME % FIXME - -name_fmt "" +name_fmt "FIXME" % name_gen "FIXME" % name_miss "FIXME" % name_mr "FIXME" diff --git a/localedata/locales/bo_IN b/localedata/locales/bo_IN index 8ab793c833..a1a62808fb 100644 --- a/localedata/locales/bo_IN +++ b/localedata/locales/bo_IN @@ -71,7 +71,7 @@ END LC_MEASUREMENT LC_NAME % FIXME -name_fmt "" +name_fmt "FIXME" % name_gen "FIXME" % name_miss "FIXME" % name_mr "FIXME" diff --git a/nis/nss_compat/compat-grp.c b/nis/nss_compat/compat-grp.c index 78e14d6d3c..c241e8297c 100644 --- a/nis/nss_compat/compat-grp.c +++ b/nis/nss_compat/compat-grp.c @@ -194,9 +194,6 @@ _nss_compat_setgrent (int stayopen) static enum nss_status internal_endgrent (ent_t *ent) { - if (nss_endgrent) - nss_endgrent (); - if (ent->stream != NULL) { fclose (ent->stream); @@ -222,6 +219,9 @@ _nss_compat_endgrent (void) __libc_lock_lock (lock); + if (nss_endgrent) + nss_endgrent (); + result = internal_endgrent (&ext_ent); __libc_lock_unlock (lock); diff --git a/nis/nss_compat/compat-pwd.c b/nis/nss_compat/compat-pwd.c index 127673c596..f6b8d80835 100644 --- a/nis/nss_compat/compat-pwd.c +++ b/nis/nss_compat/compat-pwd.c @@ -311,9 +311,6 @@ _nss_compat_setpwent (int stayopen) static enum nss_status internal_endpwent (ent_t *ent) { - if (nss_endpwent) - nss_endpwent (); - if (ent->stream != NULL) { fclose (ent->stream); @@ -346,6 +343,9 @@ _nss_compat_endpwent (void) __libc_lock_lock (lock); + if (nss_endpwent) + nss_endpwent (); + result = internal_endpwent (&ext_ent); __libc_lock_unlock (lock); diff --git a/nis/nss_compat/compat-spwd.c b/nis/nss_compat/compat-spwd.c index 4890ce6f90..e404841b90 100644 --- a/nis/nss_compat/compat-spwd.c +++ b/nis/nss_compat/compat-spwd.c @@ -169,7 +169,7 @@ copy_spwd_changes (struct spwd *dest, struct spwd *src, } static enum nss_status -internal_setspent (ent_t *ent, int stayopen) +internal_setspent (ent_t *ent, int stayopen, int needent) { enum nss_status status = NSS_STATUS_SUCCESS; @@ -239,7 +239,7 @@ internal_setspent (ent_t *ent, int stayopen) give_spwd_free (&ent->pwd); - if (status == NSS_STATUS_SUCCESS && nss_setspent) + if (needent && status == NSS_STATUS_SUCCESS && nss_setspent) ent->setent_status = nss_setspent (stayopen); return status; @@ -256,7 +256,7 @@ _nss_compat_setspent (int stayopen) if (ni == NULL) init_nss_interface (); - result = internal_setspent (&ext_ent, stayopen); + result = internal_setspent (&ext_ent, stayopen, 1); __libc_lock_unlock (lock); @@ -267,9 +267,6 @@ _nss_compat_setspent (int stayopen) static enum nss_status internal_endspent (ent_t *ent) { - if (nss_endspent) - nss_endspent (); - if (ent->stream != NULL) { fclose (ent->stream); @@ -303,6 +300,9 @@ _nss_compat_endspent (void) __libc_lock_lock (lock); + if (nss_endspent) + nss_endspent (); + result = internal_endspent (&ext_ent); __libc_lock_unlock (lock); @@ -658,7 +658,7 @@ _nss_compat_getspent_r (struct spwd *pwd, char *buffer, size_t buflen, init_nss_interface (); if (ext_ent.stream == NULL) - result = internal_setspent (&ext_ent, 1); + result = internal_setspent (&ext_ent, 1, 1); if (result == NSS_STATUS_SUCCESS) result = internal_getspent_r (pwd, &ext_ent, buffer, buflen, errnop); @@ -830,7 +830,7 
@@ _nss_compat_getspnam_r (const char *name, struct spwd *pwd, __libc_lock_unlock (lock); - result = internal_setspent (&ent, 0); + result = internal_setspent (&ent, 0, 0); if (result == NSS_STATUS_SUCCESS) result = internal_getspnam_r (name, pwd, &ent, buffer, buflen, errnop); diff --git a/nss/Makefile b/nss/Makefile index 1fa7f1f397..d6f0139bf5 100644 --- a/nss/Makefile +++ b/nss/Makefile @@ -39,7 +39,7 @@ install-bin := getent makedb makedb-modules = xmalloc hash-string extra-objs += $(makedb-modules:=.o) -tests = test-netdb tst-nss-test1 test-digits-dots +tests = test-netdb tst-nss-test1 test-digits-dots tst-nss-getpwent xtests = bug-erange # Specify rules for the nss_* modules. We have some services. diff --git a/nss/nss_files/files-XXX.c b/nss/nss_files/files-XXX.c index 212b938fdf..2d6602460a 100644 --- a/nss/nss_files/files-XXX.c +++ b/nss/nss_files/files-XXX.c @@ -60,24 +60,23 @@ /* Locks the static variables in this file. */ __libc_lock_define_initialized (static, lock) -/* Maintenance of the shared stream open on the database file. */ +/* Maintenance of the stream open on the database file. For getXXent + operations the stream needs to be held open across calls, the other + getXXbyYY operations all use their own stream. */ static FILE *stream; -static fpos_t position; -static enum { nouse, getent, getby } last_use; -static int keep_stream; /* Open database file if not already opened. */ static enum nss_status -internal_setent (int stayopen) +internal_setent (FILE **stream) { enum nss_status status = NSS_STATUS_SUCCESS; - if (stream == NULL) + if (*stream == NULL) { - stream = fopen (DATAFILE, "rce"); + *stream = fopen (DATAFILE, "rce"); - if (stream == NULL) + if (*stream == NULL) status = errno == EAGAIN ? NSS_STATUS_TRYAGAIN : NSS_STATUS_UNAVAIL; else { @@ -90,7 +89,7 @@ internal_setent (int stayopen) int result; int flags; - result = flags = fcntl (fileno (stream), F_GETFD, 0); + result = flags = fcntl (fileno (*stream), F_GETFD, 0); if (result >= 0) { # ifdef O_CLOEXEC @@ -100,15 +99,15 @@ internal_setent (int stayopen) # endif { flags |= FD_CLOEXEC; - result = fcntl (fileno (stream), F_SETFD, flags); + result = fcntl (fileno (*stream), F_SETFD, flags); } } if (result < 0) { /* Something went wrong. Close the stream and return a failure. */ - fclose (stream); - stream = NULL; + fclose (*stream); + *stream = NULL; status = NSS_STATUS_UNAVAIL; } } @@ -116,11 +115,7 @@ internal_setent (int stayopen) } } else - rewind (stream); - - /* Remember STAYOPEN flag. */ - if (stream != NULL) - keep_stream |= stayopen; + rewind (*stream); return status; } @@ -134,16 +129,7 @@ CONCAT(_nss_files_set,ENTNAME) (int stayopen) __libc_lock_lock (lock); - status = internal_setent (stayopen); - - if (status == NSS_STATUS_SUCCESS && fgetpos (stream, &position) < 0) - { - fclose (stream); - stream = NULL; - status = NSS_STATUS_UNAVAIL; - } - - last_use = getent; + status = internal_setent (&stream); __libc_lock_unlock (lock); @@ -153,12 +139,12 @@ CONCAT(_nss_files_set,ENTNAME) (int stayopen) /* Close the database file. */ static void -internal_endent (void) +internal_endent (FILE **stream) { - if (stream != NULL) + if (*stream != NULL) { - fclose (stream); - stream = NULL; + fclose (*stream); + *stream = NULL; } } @@ -169,10 +155,7 @@ CONCAT(_nss_files_end,ENTNAME) (void) { __libc_lock_lock (lock); - internal_endent (); - - /* Reset STAYOPEN flag. 
*/ - keep_stream = 0; + internal_endent (&stream); __libc_lock_unlock (lock); @@ -227,7 +210,7 @@ get_contents (char *linebuf, size_t len, FILE *stream) /* Parsing the database file into `struct STRUCTURE' data structures. */ static enum nss_status -internal_getent (struct STRUCTURE *result, +internal_getent (FILE *stream, struct STRUCTURE *result, char *buffer, size_t buflen, int *errnop H_ERRNO_PROTO EXTRA_ARGS_DECL) { @@ -300,45 +283,14 @@ CONCAT(_nss_files_get,ENTNAME_r) (struct STRUCTURE *result, char *buffer, { int save_errno = errno; - status = internal_setent (0); + status = internal_setent (&stream); __set_errno (save_errno); - - if (status == NSS_STATUS_SUCCESS && fgetpos (stream, &position) < 0) - { - fclose (stream); - stream = NULL; - status = NSS_STATUS_UNAVAIL; - } } if (status == NSS_STATUS_SUCCESS) - { - /* If the last use was not by the getent function we need the - position the stream. */ - if (last_use != getent) - { - if (fsetpos (stream, &position) < 0) - status = NSS_STATUS_UNAVAIL; - else - last_use = getent; - } - - if (status == NSS_STATUS_SUCCESS) - { - status = internal_getent (result, buffer, buflen, errnop - H_ERRNO_ARG EXTRA_ARGS_VALUE); - - /* Remember this position if we were successful. If the - operation failed we give the user a chance to repeat the - operation (perhaps the buffer was too small). */ - if (status == NSS_STATUS_SUCCESS) - fgetpos (stream, &position); - else - /* We must make sure we reposition the stream the next call. */ - last_use = nouse; - } - } + status = internal_getent (stream, result, buffer, buflen, errnop + H_ERRNO_ARG EXTRA_ARGS_VALUE); __libc_lock_unlock (lock); @@ -364,27 +316,20 @@ _nss_files_get##name##_r (proto, \ size_t buflen, int *errnop H_ERRNO_PROTO) \ { \ enum nss_status status; \ + FILE *stream = NULL; \ \ - __libc_lock_lock (lock); \ - \ - /* Reset file pointer to beginning or open file. */ \ - status = internal_setent (keep_stream); \ + /* Open file. */ \ + status = internal_setent (&stream); \ \ if (status == NSS_STATUS_SUCCESS) \ { \ - /* Tell getent function that we have repositioned the file pointer. */ \ - last_use = getby; \ - \ - while ((status = internal_getent (result, buffer, buflen, errnop \ + while ((status = internal_getent (stream, result, buffer, buflen, errnop \ H_ERRNO_ARG EXTRA_ARGS_VALUE)) \ == NSS_STATUS_SUCCESS) \ { break_if_match } \ \ - if (! keep_stream) \ - internal_endent (); \ + internal_endent (&stream); \ } \ \ - __libc_lock_unlock (lock); \ - \ return status; \ } diff --git a/nss/nss_files/files-alias.c b/nss/nss_files/files-alias.c index 53088f6a8b..4b085d36e9 100644 --- a/nss/nss_files/files-alias.c +++ b/nss/nss_files/files-alias.c @@ -33,23 +33,23 @@ /* Locks the static variables in this file. */ __libc_lock_define_initialized (static, lock) -/* Maintenance of the shared stream open on the database file. */ +/* Maintenance of the stream open on the database file. For getXXent + operations the stream needs to be held open across calls, the other + getXXbyYY operations all use their own stream. */ static FILE *stream; -static fpos_t position; -static enum { nouse, getent, getby } last_use; static enum nss_status -internal_setent (void) +internal_setent (FILE **stream) { enum nss_status status = NSS_STATUS_SUCCESS; - if (stream == NULL) + if (*stream == NULL) { - stream = fopen ("/etc/aliases", "rce"); + *stream = fopen ("/etc/aliases", "rce"); - if (stream == NULL) + if (*stream == NULL) status = errno == EAGAIN ? 
NSS_STATUS_TRYAGAIN : NSS_STATUS_UNAVAIL; else { @@ -62,7 +62,7 @@ internal_setent (void) int result; int flags; - result = flags = fcntl (fileno (stream), F_GETFD, 0); + result = flags = fcntl (fileno (*stream), F_GETFD, 0); if (result >= 0) { # ifdef O_CLOEXEC @@ -72,14 +72,14 @@ internal_setent (void) # endif { flags |= FD_CLOEXEC; - result = fcntl (fileno (stream), F_SETFD, flags); + result = fcntl (fileno (*stream), F_SETFD, flags); } } if (result < 0) { /* Something went wrong. Close the stream and return a failure. */ - fclose (stream); + fclose (*stream); stream = NULL; status = NSS_STATUS_UNAVAIL; } @@ -88,7 +88,7 @@ internal_setent (void) } } else - rewind (stream); + rewind (*stream); return status; } @@ -102,16 +102,7 @@ _nss_files_setaliasent (void) __libc_lock_lock (lock); - status = internal_setent (); - - if (status == NSS_STATUS_SUCCESS && fgetpos (stream, &position) < 0) - { - fclose (stream); - stream = NULL; - status = NSS_STATUS_UNAVAIL; - } - - last_use = getent; + status = internal_setent (&stream); __libc_lock_unlock (lock); @@ -121,12 +112,12 @@ _nss_files_setaliasent (void) /* Close the database file. */ static void -internal_endent (void) +internal_endent (FILE **stream) { - if (stream != NULL) + if (*stream != NULL) { - fclose (stream); - stream = NULL; + fclose (*stream); + *stream = NULL; } } @@ -137,7 +128,7 @@ _nss_files_endaliasent (void) { __libc_lock_lock (lock); - internal_endent (); + internal_endent (&stream); __libc_lock_unlock (lock); @@ -146,7 +137,7 @@ _nss_files_endaliasent (void) /* Parsing the database file into `struct aliasent' data structures. */ static enum nss_status -get_next_alias (const char *match, struct aliasent *result, +get_next_alias (FILE *stream, const char *match, struct aliasent *result, char *buffer, size_t buflen, int *errnop) { enum nss_status status = NSS_STATUS_NOTFOUND; @@ -397,35 +388,16 @@ _nss_files_getaliasent_r (struct aliasent *result, char *buffer, size_t buflen, /* Be prepared that the set*ent function was not called before. */ if (stream == NULL) - status = internal_setent (); + status = internal_setent (&stream); if (status == NSS_STATUS_SUCCESS) { - /* If the last use was not by the getent function we need the - position the stream. */ - if (last_use != getent) - { - if (fsetpos (stream, &position) < 0) - status = NSS_STATUS_UNAVAIL; - else - last_use = getent; - } + result->alias_local = 1; - if (status == NSS_STATUS_SUCCESS) - { - result->alias_local = 1; - - /* Read lines until we get a definite result. */ - do - status = get_next_alias (NULL, result, buffer, buflen, errnop); - while (status == NSS_STATUS_RETURN); - - /* If we successfully read an entry remember this position. */ - if (status == NSS_STATUS_SUCCESS) - fgetpos (stream, &position); - else - last_use = nouse; - } + /* Read lines until we get a definite result. */ + do + status = get_next_alias (stream, NULL, result, buffer, buflen, errnop); + while (status == NSS_STATUS_RETURN); } __libc_lock_unlock (lock); @@ -440,6 +412,7 @@ _nss_files_getaliasbyname_r (const char *name, struct aliasent *result, { /* Return next entry in host file. */ enum nss_status status = NSS_STATUS_SUCCESS; + FILE *stream = NULL; if (name == NULL) { @@ -447,11 +420,8 @@ _nss_files_getaliasbyname_r (const char *name, struct aliasent *result, return NSS_STATUS_UNAVAIL; } - __libc_lock_lock (lock); - - /* Open the stream or rest it. */ - status = internal_setent (); - last_use = getby; + /* Open the stream. 
*/ + status = internal_setent (&stream); if (status == NSS_STATUS_SUCCESS) { @@ -459,13 +429,11 @@ _nss_files_getaliasbyname_r (const char *name, struct aliasent *result, /* Read lines until we get a definite result. */ do - status = get_next_alias (name, result, buffer, buflen, errnop); + status = get_next_alias (stream, name, result, buffer, buflen, errnop); while (status == NSS_STATUS_RETURN); } - internal_endent (); - - __libc_lock_unlock (lock); + internal_endent (&stream); return status; } diff --git a/nss/nss_files/files-hosts.c b/nss/nss_files/files-hosts.c index ab64eadabb..8de4b1b749 100644 --- a/nss/nss_files/files-hosts.c +++ b/nss/nss_files/files-hosts.c @@ -120,14 +120,13 @@ _nss_files_gethostbyname3_r (const char *name, int af, struct hostent *result, char *buffer, size_t buflen, int *errnop, int *herrnop, int32_t *ttlp, char **canonp) { + FILE *stream = NULL; uintptr_t pad = -(uintptr_t) buffer % __alignof__ (struct hostent_data); buffer += pad; buflen = buflen > pad ? buflen - pad : 0; - __libc_lock_lock (lock); - - /* Reset file pointer to beginning or open file. */ - enum nss_status status = internal_setent (keep_stream); + /* Open file. */ + enum nss_status status = internal_setent (&stream); if (status == NSS_STATUS_SUCCESS) { @@ -135,10 +134,7 @@ _nss_files_gethostbyname3_r (const char *name, int af, struct hostent *result, addresses to IPv6 addresses really the right thing to do? */ int flags = ((_res.options & RES_USE_INET6) ? AI_V4MAPPED : 0); - /* Tell getent function that we have repositioned the file pointer. */ - last_use = getby; - - while ((status = internal_getent (result, buffer, buflen, errnop, + while ((status = internal_getent (stream, result, buffer, buflen, errnop, herrnop, af, flags)) == NSS_STATUS_SUCCESS) { @@ -165,7 +161,7 @@ _nss_files_gethostbyname3_r (const char *name, int af, struct hostent *result, bufferend = (char *) &result->h_aliases[naliases + 1]; again: - while ((status = internal_getent (&tmp_result_buf, tmp_buffer, + while ((status = internal_getent (stream, &tmp_result_buf, tmp_buffer, tmp_buflen, errnop, herrnop, af, flags)) == NSS_STATUS_SUCCESS) @@ -341,15 +337,12 @@ _nss_files_gethostbyname3_r (const char *name, int af, struct hostent *result, free (tmp_buffer); } - if (! keep_stream) - internal_endent (); + internal_endent (&stream); } if (canonp && status == NSS_STATUS_SUCCESS) *canonp = result->h_name; - __libc_lock_unlock (lock); - return status; } @@ -378,16 +371,13 @@ _nss_files_gethostbyname4_r (const char *name, struct gaih_addrtuple **pat, char *buffer, size_t buflen, int *errnop, int *herrnop, int32_t *ttlp) { - __libc_lock_lock (lock); + FILE *stream = NULL; - /* Reset file pointer to beginning or open file. */ - enum nss_status status = internal_setent (keep_stream); + /* Open file. */ + enum nss_status status = internal_setent (&stream); if (status == NSS_STATUS_SUCCESS) { - /* Tell getent function that we have repositioned the file pointer. */ - last_use = getby; - bool any = false; bool got_canon = false; while (1) @@ -399,7 +389,7 @@ _nss_files_gethostbyname4_r (const char *name, struct gaih_addrtuple **pat, buflen = buflen > pad ? buflen - pad : 0; struct hostent result; - status = internal_getent (&result, buffer, buflen, errnop, + status = internal_getent (stream, &result, buffer, buflen, errnop, herrnop, AF_UNSPEC, 0); if (status != NSS_STATUS_SUCCESS) break; @@ -475,8 +465,7 @@ _nss_files_gethostbyname4_r (const char *name, struct gaih_addrtuple **pat, status = NSS_STATUS_SUCCESS; } - if (! 
keep_stream) - internal_endent (); + internal_endent (&stream); } else if (status == NSS_STATUS_TRYAGAIN) { @@ -489,7 +478,5 @@ _nss_files_gethostbyname4_r (const char *name, struct gaih_addrtuple **pat, *herrnop = NO_DATA; } - __libc_lock_unlock (lock); - return status; } diff --git a/nss/tst-nss-getpwent.c b/nss/tst-nss-getpwent.c new file mode 100644 index 0000000000..f2e8abce60 --- /dev/null +++ b/nss/tst-nss-getpwent.c @@ -0,0 +1,118 @@ +/* Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <pwd.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +int +do_test (void) +{ + /* Count the number of entries in the password database, and fetch + data from the first and last entries. */ + size_t count = 0; + struct passwd * pw; + char *first_name = NULL; + uid_t first_uid = 0; + char *last_name = NULL; + uid_t last_uid = 0; + setpwent (); + while ((pw = getpwent ()) != NULL) + { + if (first_name == NULL) + { + first_name = strdup (pw->pw_name); + if (first_name == NULL) + { + printf ("strdup: %m\n"); + return 1; + } + first_uid = pw->pw_uid; + } + + free (last_name); + last_name = strdup (pw->pw_name); + if (last_name == NULL) + { + printf ("strdup: %m\n"); + return 1; + } + last_uid = pw->pw_uid; + ++count; + } + endpwent (); + + if (count == 0) + { + printf ("No entries in the password database.\n"); + return 0; + } + + /* Try again, this time interleaving with name-based and UID-based + lookup operations. The counts do not match if the interleaved + lookups affected the enumeration. 
*/ + size_t new_count = 0; + setpwent (); + while ((pw = getpwent ()) != NULL) + { + if (new_count == count) + { + printf ("Additional entry in the password database.\n"); + return 1; + } + ++new_count; + struct passwd *pw2 = getpwnam (first_name); + if (pw2 == NULL) + { + printf ("getpwnam (%s) failed: %m\n", first_name); + return 1; + } + pw2 = getpwnam (last_name); + if (pw2 == NULL) + { + printf ("getpwnam (%s) failed: %m\n", last_name); + return 1; + } + pw2 = getpwuid (first_uid); + if (pw2 == NULL) + { + printf ("getpwuid (%llu) failed: %m\n", + (unsigned long long) first_uid); + return 1; + } + pw2 = getpwuid (last_uid); + if (pw2 == NULL) + { + printf ("getpwuid (%llu) failed: %m\n", + (unsigned long long) last_uid); + return 1; + } + } + endpwent (); + if (new_count < count) + { + printf ("Missing entry in the password database.\n"); + return 1; + } + + return 0; +} + +#define TEST_FUNCTION do_test () +#include "../test-skeleton.c" diff --git a/resolv/nss_dns/dns-host.c b/resolv/nss_dns/dns-host.c index 755832e7c1..1a049007fc 100644 --- a/resolv/nss_dns/dns-host.c +++ b/resolv/nss_dns/dns-host.c @@ -615,7 +615,8 @@ getanswer_r (const querybuf *answer, int anslen, const char *qname, int qtype, int have_to_map = 0; uintptr_t pad = -(uintptr_t) buffer % __alignof__ (struct host_data); buffer += pad; - if (__glibc_unlikely (buflen < sizeof (struct host_data) + pad)) + buflen = buflen > pad ? buflen - pad : 0; + if (__glibc_unlikely (buflen < sizeof (struct host_data))) { /* The buffer is too small. */ too_small: diff --git a/sysdeps/ieee754/dbl-64/Makefile b/sysdeps/ieee754/dbl-64/Makefile index 35f545ff8e..5557c75b45 100644 --- a/sysdeps/ieee754/dbl-64/Makefile +++ b/sysdeps/ieee754/dbl-64/Makefile @@ -2,4 +2,5 @@ ifeq ($(subdir),math) # branred depends on precise IEEE double rounding CFLAGS-branred.c = $(config-cflags-nofma) CFLAGS-e_sqrt.c = $(config-cflags-nofma) +CFLAGS-e_pow.c = $(config-cflags-nofma) endif diff --git a/sysdeps/powerpc/bits/atomic.h b/sysdeps/powerpc/bits/atomic.h index 2ffba48d55..b05b0f7aa0 100644 --- a/sysdeps/powerpc/bits/atomic.h +++ b/sysdeps/powerpc/bits/atomic.h @@ -77,7 +77,6 @@ typedef uintmax_t uatomic_max_t; #endif #define atomic_full_barrier() __asm ("sync" ::: "memory") -#define atomic_write_barrier() __asm ("eieio" ::: "memory") #define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval) \ ({ \ @@ -153,6 +152,34 @@ typedef uintmax_t uatomic_max_t; __val; \ }) +#define __arch_atomic_exchange_and_add_32_acq(mem, value) \ + ({ \ + __typeof (*mem) __val, __tmp; \ + __asm __volatile ("1: lwarx %0,0,%3" MUTEX_HINT_ACQ "\n" \ + " add %1,%0,%4\n" \ + " stwcx. %1,0,%3\n" \ + " bne- 1b\n" \ + __ARCH_ACQ_INSTR \ + : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \ + : "b" (mem), "r" (value), "m" (*mem) \ + : "cr0", "memory"); \ + __val; \ + }) + +#define __arch_atomic_exchange_and_add_32_rel(mem, value) \ + ({ \ + __typeof (*mem) __val, __tmp; \ + __asm __volatile (__ARCH_REL_INSTR "\n" \ + "1: lwarx %0,0,%3" MUTEX_HINT_REL "\n" \ + " add %1,%0,%4\n" \ + " stwcx. 
%1,0,%3\n" \ + " bne- 1b" \ + : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \ + : "b" (mem), "r" (value), "m" (*mem) \ + : "cr0", "memory"); \ + __val; \ + }) + #define __arch_atomic_increment_val_32(mem) \ ({ \ __typeof (*(mem)) __val; \ @@ -253,6 +280,28 @@ typedef uintmax_t uatomic_max_t; abort (); \ __result; \ }) +#define atomic_exchange_and_add_acq(mem, value) \ + ({ \ + __typeof (*(mem)) __result; \ + if (sizeof (*mem) == 4) \ + __result = __arch_atomic_exchange_and_add_32_acq (mem, value); \ + else if (sizeof (*mem) == 8) \ + __result = __arch_atomic_exchange_and_add_64_acq (mem, value); \ + else \ + abort (); \ + __result; \ + }) +#define atomic_exchange_and_add_rel(mem, value) \ + ({ \ + __typeof (*(mem)) __result; \ + if (sizeof (*mem) == 4) \ + __result = __arch_atomic_exchange_and_add_32_rel (mem, value); \ + else if (sizeof (*mem) == 8) \ + __result = __arch_atomic_exchange_and_add_64_rel (mem, value); \ + else \ + abort (); \ + __result; \ + }) #define atomic_increment_val(mem) \ ({ \ diff --git a/sysdeps/powerpc/nptl/elide.h b/sysdeps/powerpc/nptl/elide.h new file mode 100644 index 0000000000..01572d99ce --- /dev/null +++ b/sysdeps/powerpc/nptl/elide.h @@ -0,0 +1,111 @@ +/* elide.h: Generic lock elision support for powerpc. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef ELIDE_PPC_H +# define ELIDE_PPC_H + +#ifdef ENABLE_LOCK_ELISION +# include <htm.h> +# include <elision-conf.h> + +/* Returns true if the lock defined by is_lock_free was elided. + ADAPT_COUNT is a pointer to per-lock state variable. */ + +static inline bool +__elide_lock (uint8_t *adapt_count, int is_lock_free) +{ + if (*adapt_count > 0) + { + (*adapt_count)--; + return false; + } + + for (int i = __elision_aconf.try_tbegin; i > 0; i--) + { + if (__builtin_tbegin (0)) + { + if (is_lock_free) + return true; + /* Lock was busy. */ + __builtin_tabort (_ABORT_LOCK_BUSY); + } + else + { + /* A persistent failure indicates that a retry will probably + result in another failure. Use normal locking now and + for the next couple of calls. */ + if (_TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ())) + { + if (__elision_aconf.skip_lock_internal_abort > 0) + *adapt_count = __elision_aconf.skip_lock_internal_abort; + break; + } + /* Same logic as above, but for a number of temporary failures in a + row.
*/ + else if (__elision_aconf.skip_lock_out_of_tbegin_retries > 0 + && __elision_aconf.try_tbegin > 0) + *adapt_count = __elision_aconf.skip_lock_out_of_tbegin_retries; + } + } + + return false; +} + +# define ELIDE_LOCK(adapt_count, is_lock_free) \ + __elide_lock (&(adapt_count), is_lock_free) + + +static inline bool +__elide_trylock (uint8_t *adapt_count, int is_lock_free, int write) +{ + if (__elision_aconf.try_tbegin > 0) + { + if (write) + __builtin_tabort (_ABORT_NESTED_TRYLOCK); + return __elide_lock (adapt_count, is_lock_free); + } + return false; +} + +# define ELIDE_TRYLOCK(adapt_count, is_lock_free, write) \ + __elide_trylock (&(adapt_count), is_lock_free, write) + + +static inline bool +__elide_unlock (int is_lock_free) +{ + if (is_lock_free) + { + __builtin_tend (0); + return true; + } + return false; +} + +# define ELIDE_UNLOCK(is_lock_free) \ + __elide_unlock (is_lock_free) + +# else + +# define ELIDE_LOCK(adapt_count, is_lock_free) 0 +# define ELIDE_TRYLOCK(adapt_count, is_lock_free, write) 0 +# define ELIDE_UNLOCK(is_lock_free) 0 + +#endif /* ENABLE_LOCK_ELISION */ + +#endif diff --git a/sysdeps/powerpc/nptl/tcb-offsets.sym b/sysdeps/powerpc/nptl/tcb-offsets.sym index f996759027..d955142aff 100644 --- a/sysdeps/powerpc/nptl/tcb-offsets.sym +++ b/sysdeps/powerpc/nptl/tcb-offsets.sym @@ -19,6 +19,7 @@ POINTER_GUARD (offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof ( TAR_SAVE (offsetof (tcbhead_t, tar_save) - TLS_TCB_OFFSET - sizeof (tcbhead_t)) DSO_SLOT1 (offsetof (tcbhead_t, dso_slot1) - TLS_TCB_OFFSET - sizeof (tcbhead_t)) DSO_SLOT2 (offsetof (tcbhead_t, dso_slot2) - TLS_TCB_OFFSET - sizeof (tcbhead_t)) +TM_CAPABLE (offsetof (tcbhead_t, tm_capable) - TLS_TCB_OFFSET - sizeof (tcbhead_t)) #ifndef __ASSUME_PRIVATE_FUTEX PRIVATE_FUTEX_OFFSET thread_offsetof (header.private_futex) #endif diff --git a/sysdeps/powerpc/nptl/tls.h b/sysdeps/powerpc/nptl/tls.h index b80a5fbf54..9877b73ba1 100644 --- a/sysdeps/powerpc/nptl/tls.h +++ b/sysdeps/powerpc/nptl/tls.h @@ -63,6 +63,8 @@ typedef union dtv are private. */ typedef struct { + /* Indicate if HTM capable (ISA 2.07). */ + int tm_capable; /* Reservation for Dynamic System Optimizer ABI. */ uintptr_t dso_slot2; uintptr_t dso_slot1; @@ -130,11 +132,17 @@ register void *__thread_register __asm__ ("r13"); special attention since 'errno' is not yet available and if the operation can cause a failure 'errno' must not be touched. */ # define TLS_INIT_TP(tcbp) \ - (__thread_register = (void *) (tcbp) + TLS_TCB_OFFSET, NULL) + ({ \ + __thread_register = (void *) (tcbp) + TLS_TCB_OFFSET; \ + THREAD_SET_TM_CAPABLE (GLRO (dl_hwcap2) & PPC_FEATURE2_HAS_HTM ? 1 : 0); \ + NULL; \ + }) /* Value passed to 'clone' for initialization of the thread register. */ # define TLS_DEFINE_INIT_TP(tp, pd) \ - void *tp = (void *) (pd) + TLS_TCB_OFFSET + TLS_PRE_TCB_SIZE + void *tp = (void *) (pd) + TLS_TCB_OFFSET + TLS_PRE_TCB_SIZE; \ + (((tcbhead_t *) ((char *) tp - TLS_TCB_OFFSET))[-1].tm_capable) = \ + THREAD_GET_TM_CAPABLE (); /* Return the address of the dtv for the current thread. */ # define THREAD_DTV() \ @@ -188,6 +196,13 @@ register void *__thread_register __asm__ ("r13"); + TLS_PRE_TCB_SIZE))[-1].pointer_guard \ = THREAD_GET_POINTER_GUARD()) +/* tm_capable field in TCB head. 
*/ +# define THREAD_GET_TM_CAPABLE() \ + (((tcbhead_t *) ((char *) __thread_register \ + - TLS_TCB_OFFSET))[-1].tm_capable) +# define THREAD_SET_TM_CAPABLE(value) \ + (THREAD_GET_TM_CAPABLE () = (value)) + /* l_tls_offset == 0 is perfectly valid on PPC, so we have to use some different value to mean unset l_tls_offset. */ # define NO_TLS_OFFSET -1 diff --git a/sysdeps/powerpc/powerpc32/bits/atomic.h b/sysdeps/powerpc/powerpc32/bits/atomic.h index 7613bdc485..7422262dc1 100644 --- a/sysdeps/powerpc/powerpc32/bits/atomic.h +++ b/sysdeps/powerpc/powerpc32/bits/atomic.h @@ -95,6 +95,12 @@ #define __arch_atomic_exchange_and_add_64(mem, value) \ ({ abort (); (*mem) = (value); }) +#define __arch_atomic_exchange_and_add_64_acq(mem, value) \ + ({ abort (); (*mem) = (value); }) + +#define __arch_atomic_exchange_and_add_64_rel(mem, value) \ + ({ abort (); (*mem) = (value); }) + #define __arch_atomic_increment_val_64(mem) \ ({ abort (); (*mem)++; }) @@ -117,6 +123,7 @@ # ifndef UP # define __ARCH_REL_INSTR "lwsync" # endif +# define atomic_write_barrier() __asm ("lwsync" ::: "memory") #else /* * Older powerpc32 processors don't support the new "light weight" @@ -124,6 +131,7 @@ * for all powerpc32 applications. */ # define atomic_read_barrier() __asm ("sync" ::: "memory") +# define atomic_write_barrier() __asm ("sync" ::: "memory") #endif /* diff --git a/sysdeps/powerpc/powerpc32/sysdep.h b/sysdeps/powerpc/powerpc32/sysdep.h index c8a56aadbf..261ca5f86b 100644 --- a/sysdeps/powerpc/powerpc32/sysdep.h +++ b/sysdeps/powerpc/powerpc32/sysdep.h @@ -88,7 +88,23 @@ GOT_LABEL: ; \ cfi_endproc; \ ASM_SIZE_DIRECTIVE(name) +#if !defined IS_IN_rtld && defined (ENABLE_LOCK_ELISION) +# define ABORT_TRANSACTION \ + cmpwi 2,0; \ + beq 1f; \ + lwz 0,TM_CAPABLE(2); \ + cmpwi 0,0; \ + beq 1f; \ + li 11,_ABORT_SYSCALL; \ + tabort. 11; \ + .align 4; \ +1: +#else +# define ABORT_TRANSACTION +#endif + #define DO_CALL(syscall) \ + ABORT_TRANSACTION \ li 0,syscall; \ sc diff --git a/sysdeps/powerpc/powerpc64/bits/atomic.h b/sysdeps/powerpc/powerpc64/bits/atomic.h index 527fe7c133..e64cb9fa54 100644 --- a/sysdeps/powerpc/powerpc64/bits/atomic.h +++ b/sysdeps/powerpc/powerpc64/bits/atomic.h @@ -97,7 +97,7 @@ ({ \ unsigned long __tmp; \ __asm __volatile (__ARCH_REL_INSTR "\n" \ - "1: ldarx %0,0,%2" MUTEX_HINT_REL "\n" \ + "1: ldarx %0,0,%1" MUTEX_HINT_REL "\n" \ " subf. %0,%2,%0\n" \ " bne 2f\n" \ " stdcx. %3,0,%1\n" \ @@ -183,6 +183,34 @@ __val; \ }) +#define __arch_atomic_exchange_and_add_64_acq(mem, value) \ + ({ \ + __typeof (*mem) __val, __tmp; \ + __asm __volatile ("1: ldarx %0,0,%3" MUTEX_HINT_ACQ "\n" \ + " add %1,%0,%4\n" \ + " stdcx. %1,0,%3\n" \ + " bne- 1b\n" \ + __ARCH_ACQ_INSTR \ + : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \ + : "b" (mem), "r" (value), "m" (*mem) \ + : "cr0", "memory"); \ + __val; \ + }) + +#define __arch_atomic_exchange_and_add_64_rel(mem, value) \ + ({ \ + __typeof (*mem) __val, __tmp; \ + __asm __volatile (__ARCH_REL_INSTR "\n" \ + "1: ldarx %0,0,%3" MUTEX_HINT_REL "\n" \ + " add %1,%0,%4\n" \ + " stdcx. 
%1,0,%3\n" \ + " bne- 1b" \ + : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \ + : "b" (mem), "r" (value), "m" (*mem) \ + : "cr0", "memory"); \ + __val; \ + }) + #define __arch_atomic_increment_val_64(mem) \ ({ \ __typeof (*(mem)) __val; \ @@ -234,6 +262,7 @@ #ifndef UP # define __ARCH_REL_INSTR "lwsync" #endif +#define atomic_write_barrier() __asm ("lwsync" ::: "memory") /* * Include the rest of the atomic ops macros which are common to both diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 82722fb69f..b7ea28420f 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -2,24 +2,26 @@ ifeq ($(subdir),string) sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ memcpy-power4 memcpy-ppc64 memcmp-power7 memcmp-power4 \ memcmp-ppc64 memset-power7 memset-power6 memset-power4 \ - memset-ppc64 bzero-power4 bzero-power6 bzero-power7 \ + memset-ppc64 memset-power8 \ mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \ memrchr-power7 memrchr-ppc64 rawmemchr-power7 \ rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \ strnlen-ppc64 strcasecmp-power7 strcasecmp_l-power7 \ - strncase-power7 strncase_l-power7 strncmp-power7 \ - strncmp-power4 strncmp-ppc64 strchr-power7 strchr-ppc64 \ + strncase-power7 strncase_l-power7 \ + strncmp-power8 strncmp-power7 strncmp-power4 strncmp-ppc64 \ + strchr-power7 strchr-ppc64 \ strchrnul-power7 strchrnul-ppc64 wcschr-power7 \ wcschr-power6 wcschr-ppc64 wcsrchr-power7 wcsrchr-power6 \ wcsrchr-ppc64 wcscpy-power7 wcscpy-power6 wcscpy-ppc64 \ wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \ - strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \ + strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \ + stpcpy-power7 stpcpy-ppc64 \ strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \ - strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \ - strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \ - stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \ - strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \ - bcopy-ppc64 + strncpy-power7 strncpy-ppc64 \ + stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \ + strcmp-power8 strcmp-power7 strcmp-ppc64 \ + strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \ + memmove-ppc64 bcopy-ppc64 strncpy-power8 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c index ed83541fa5..298cf005a1 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c +++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c @@ -26,14 +26,17 @@ extern __typeof (bzero) __bzero_ppc attribute_hidden; extern __typeof (bzero) __bzero_power4 attribute_hidden; extern __typeof (bzero) __bzero_power6 attribute_hidden; extern __typeof (bzero) __bzero_power7 attribute_hidden; +extern __typeof (bzero) __bzero_power8 attribute_hidden; libc_ifunc (__bzero, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __bzero_power7 : - (hwcap & PPC_FEATURE_ARCH_2_05) + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __bzero_power8 : + (hwcap & PPC_FEATURE_HAS_VSX) + ? __bzero_power7 : + (hwcap & PPC_FEATURE_ARCH_2_05) ? __bzero_power6 : (hwcap & PPC_FEATURE_POWER4) - ? __bzero_power4 + ? 
__bzero_power4 : __bzero_ppc); weak_alias (__bzero, bzero) diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index a574487f2f..bd92cf6faa 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -34,6 +34,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, size_t i = 0; unsigned long int hwcap = GLRO(dl_hwcap); + unsigned long int hwcap2 = GLRO(dl_hwcap2); + /* hwcap contains only the latest supported ISA, the code checks which is and fills the previous supported ones. */ if (hwcap & PPC_FEATURE_ARCH_2_06) @@ -69,6 +71,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/memset.c. */ IFUNC_IMPL (i, name, memset, + IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07, + __memset_power8) IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX, __memset_power7) IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05, @@ -79,6 +83,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c. */ IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, + __strcpy_power8) IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX, __strcpy_power7) IFUNC_IMPL_ADD (array, i, strcpy, 1, @@ -86,6 +92,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/stpcpy.c. */ IFUNC_IMPL (i, name, stpcpy, + IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, + __stpcpy_power8) IFUNC_IMPL_ADD (array, i, stpcpy, hwcap & PPC_FEATURE_HAS_VSX, __stpcpy_power7) IFUNC_IMPL_ADD (array, i, stpcpy, 1, @@ -100,6 +108,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c. */ IFUNC_IMPL (i, name, strncmp, + IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_2_07, + __strncmp_power8) IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_HAS_VSX, __strncmp_power7) IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_POWER4, @@ -134,6 +144,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c. */ IFUNC_IMPL (i, name, bzero, + IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07, + __bzero_power8) IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX, __bzero_power7) IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05, @@ -266,33 +278,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_ppc)) - /* Support sysdeps/powerpc/powerpc64/multiarch/strspn.c. */ - IFUNC_IMPL (i, name, strspn, - IFUNC_IMPL_ADD (array, i, strspn, - hwcap & PPC_FEATURE_HAS_VSX, - __strspn_power7) - IFUNC_IMPL_ADD (array, i, strspn, 1, - __strspn_ppc)) - - /* Support sysdeps/powerpc/powerpc64/multiarch/strcspn.c. */ - IFUNC_IMPL (i, name, strcspn, - IFUNC_IMPL_ADD (array, i, strcspn, - hwcap & PPC_FEATURE_HAS_VSX, - __strcspn_power7) - IFUNC_IMPL_ADD (array, i, strcspn, 1, - __strcspn_ppc)) - - /* Support sysdeps/powerpc/powerpc64/multiarch/strpbrk.c. 
*/ - IFUNC_IMPL (i, name, strpbrk, - IFUNC_IMPL_ADD (array, i, strpbrk, - hwcap & PPC_FEATURE_HAS_VSX, - __strpbrk_power7) - IFUNC_IMPL_ADD (array, i, strpbrk, 1, - __strpbrk_ppc)) - /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */ IFUNC_IMPL (i, name, strncpy, IFUNC_IMPL_ADD (array, i, strncpy, + hwcap2 & PPC_FEATURE2_ARCH_2_07, + __strncpy_power8) + IFUNC_IMPL_ADD (array, i, strncpy, hwcap & PPC_FEATURE_HAS_VSX, __strncpy_power7) IFUNC_IMPL_ADD (array, i, strncpy, 1, @@ -301,6 +292,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */ IFUNC_IMPL (i, name, stpncpy, IFUNC_IMPL_ADD (array, i, stpncpy, + hwcap2 & PPC_FEATURE2_ARCH_2_07, + __stpncpy_power8) + IFUNC_IMPL_ADD (array, i, stpncpy, hwcap & PPC_FEATURE_HAS_VSX, __stpncpy_power7) IFUNC_IMPL_ADD (array, i, stpncpy, 1, @@ -309,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */ IFUNC_IMPL (i, name, strcmp, IFUNC_IMPL_ADD (array, i, strcmp, + hwcap2 & PPC_FEATURE2_ARCH_2_07, + __strcmp_power8) + IFUNC_IMPL_ADD (array, i, strcmp, hwcap & PPC_FEATURE_HAS_VSX, __strcmp_power7) IFUNC_IMPL_ADD (array, i, strcmp, 1, @@ -317,6 +314,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c. */ IFUNC_IMPL (i, name, strcat, IFUNC_IMPL_ADD (array, i, strcat, + hwcap2 & PPC_FEATURE2_ARCH_2_07, + __strcat_power8) + IFUNC_IMPL_ADD (array, i, strcat, hwcap & PPC_FEATURE_HAS_VSX, __strcat_power7) IFUNC_IMPL_ADD (array, i, strcat, 1, diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S index 968dc24bd3..1291fb7339 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S +++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S @@ -37,5 +37,7 @@ #undef libc_hidden_builtin_def #define libc_hidden_builtin_def(name) -#define NO_BZERO_IMPL +#undef __bzero +#define __bzero __bzero_power4 + #include <sysdeps/powerpc/powerpc64/power4/memset.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S index 65519b91f1..3dc199c535 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S +++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S @@ -37,5 +37,7 @@ #undef libc_hidden_builtin_def #define libc_hidden_builtin_def(name) -#define NO_BZERO_IMPL +#undef __bzero +#define __bzero __bzero_power6 + #include <sysdeps/powerpc/powerpc64/power6/memset.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S index 86765e74ab..fb1a3423ee 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S +++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S @@ -37,5 +37,6 @@ #undef libc_hidden_builtin_def #define libc_hidden_builtin_def(name) -#define NO_BZERO_IMPL +#undef __bzero +#define __bzero __bzero_power7 #include <sysdeps/powerpc/powerpc64/power7/memset.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S new file mode 100644 index 0000000000..e8a604b000 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S @@ -0,0 +1,43 @@ +/* Optimized memset implementation for PowerPC64/POWER8. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#undef EALIGN +#define EALIGN(name, alignt, words) \ + .section ".text"; \ + ENTRY_2(__memset_power8) \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + BODY_LABEL(__memset_power8): \ + cfi_startproc; \ + LOCALENTRY(__memset_power8) + +#undef END_GEN_TB +#define END_GEN_TB(name, mask) \ + cfi_endproc; \ + TRACEBACK_MASK(__memset_power8,mask) \ + END_2(__memset_power8) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#undef __bzero +#define __bzero __bzero_power8 + +#include <sysdeps/powerpc/powerpc64/power8/memset.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c index aa2ae7056e..9c7ed10c87 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/memset.c +++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c @@ -32,16 +32,19 @@ extern __typeof (__redirect_memset) __memset_ppc attribute_hidden; extern __typeof (__redirect_memset) __memset_power4 attribute_hidden; extern __typeof (__redirect_memset) __memset_power6 attribute_hidden; extern __typeof (__redirect_memset) __memset_power7 attribute_hidden; +extern __typeof (__redirect_memset) __memset_power8 attribute_hidden; /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ libc_ifunc (__libc_memset, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __memset_power7 : - (hwcap & PPC_FEATURE_ARCH_2_05) + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __memset_power8 : + (hwcap & PPC_FEATURE_HAS_VSX) + ? __memset_power7 : + (hwcap & PPC_FEATURE_ARCH_2_05) ? __memset_power6 : (hwcap & PPC_FEATURE_POWER4) - ? __memset_power4 + ? __memset_power4 : __memset_ppc); #undef memset diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S b/sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S index 889dfeea8e..66e6f708bd 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S +++ b/sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S @@ -1,5 +1,5 @@ -/* Optimized strspn implementation for POWER7. - Copyright (C) 2014 Free Software Foundation, Inc. +/* Optimized stpcpy implementation for POWER8/PPC64. + Copyright (C) 2015 Free Software Foundation, Inc. This file is part of the GNU C Library. 
The GNU C Library is free software; you can redistribute it and/or @@ -21,20 +21,20 @@ #undef EALIGN #define EALIGN(name, alignt, words) \ .section ".text"; \ - ENTRY_2(__strspn_power7) \ + ENTRY_2(__stpcpy_power8) \ .align ALIGNARG(alignt); \ EALIGN_W_##words; \ - BODY_LABEL(__strspn_power7): \ + BODY_LABEL(__stpcpy_power8): \ cfi_startproc; \ - LOCALENTRY(__strspn_power7) + LOCALENTRY(__stpcpy_power8) #undef END #define END(name) \ cfi_endproc; \ - TRACEBACK(__strspn_power7) \ - END_2(__strspn_power7) + TRACEBACK(__stpcpy_power8) \ + END_2(__stpcpy_power8) #undef libc_hidden_builtin_def #define libc_hidden_builtin_def(name) -#include <sysdeps/powerpc/powerpc64/power7/strspn.S> +#include <sysdeps/powerpc/powerpc64/power8/stpcpy.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S new file mode 100644 index 0000000000..d5d835de91 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S @@ -0,0 +1,39 @@ +/* Optimized stpncpy implementation for POWER8. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define USE_AS_STPNCPY + +#undef EALIGN +#define EALIGN(name, alignt, words) \ + .section ".text"; \ + ENTRY_2(__stpncpy_power8) \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + BODY_LABEL(__stpncpy_power8): \ + cfi_startproc; \ + LOCALENTRY(__stpncpy_power8) + +#undef END +#define END(name) \ + cfi_endproc; \ + TRACEBACK(__stpncpy_power8) \ + END_2(__stpncpy_power8) + +#include <sysdeps/powerpc/powerpc64/power8/stpncpy.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c index dbf85214a2..3ee50e527c 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c @@ -23,10 +23,13 @@ extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden; extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden; +extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden; libc_ifunc (__stpncpy, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __stpncpy_power7 + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __stpncpy_power8 : + (hwcap & PPC_FEATURE_HAS_VSX) + ? __stpncpy_power7 : __stpncpy_ppc); weak_alias (__stpncpy, stpncpy) diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c index 8dea70edc1..6c7544c959 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2014 Free Software Foundation, Inc. +/* Copyright (C) 2015 Free Software Foundation, Inc. This file is part of the GNU C Library. 
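The renamed wrapper files above all apply the same multiarch idiom: redefine the entry and exit macros (EALIGN/END) so the shared implementation assembles under a variant-specific symbol, then #include the base source. The C-side twin of the trick, visible in strcat-power8.c below, redirects helper calls with #define before including the generic code. A compressed, self-contained sketch of that redirect (the my_* names are hypothetical; the real files include the shared sysdeps source rather than inlining it):

#include <stdio.h>
#include <string.h>

/* Stand-ins for the tuned helpers (__strlen_power7 etc.).  */
static size_t my_strlen_power7 (const char *s) { return strlen (s); }
static char *my_strcpy_power8 (char *d, const char *s) { return strcpy (d, s); }

/* Route the generic source's calls, then build it under a new name.  */
#define strlen my_strlen_power7
#define strcpy my_strcpy_power8
#define STRCAT my_strcat_power8

char *STRCAT (char *dest, const char *src)   /* the "included" generic body */
{
  strcpy (dest + strlen (dest), src);        /* expands to the helpers */
  return dest;
}

#undef strlen
#undef strcpy

int main (void)
{
  char buf[32] = "POWER";
  my_strcat_power8 (buf, "8");
  puts (buf);                                /* prints POWER8 */
  return 0;
}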
The GNU C Library is free software; you can redistribute it and/or @@ -13,18 +13,18 @@ You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ + <http://www.gnu.org/licenses/ >. */ #include <string.h> -#define STRPBRK __strpbrk_ppc -#ifdef SHARED +#define STRCAT __strcat_power8 -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(name) \ - __hidden_ver1 (__strpbrk_ppc, __GI_strpbrk, __strpbrk_ppc); -#endif +#undef libc_hidden_def +#define libc_hidden_def(name) -extern __typeof (strpbrk) __strpbrk_ppc attribute_hidden; +extern typeof (strcpy) __strcpy_power8; +extern typeof (strlen) __strlen_power7; -#include <string/strpbrk.c> +#define strcpy __strcpy_power8 +#define strlen __strlen_power7 +#include <sysdeps/powerpc/strcat.c> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c index 847a62de52..289e9b2365 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c @@ -23,9 +23,12 @@ extern __typeof (strcat) __strcat_ppc attribute_hidden; extern __typeof (strcat) __strcat_power7 attribute_hidden; +extern __typeof (strcat) __strcat_power8 attribute_hidden; libc_ifunc (strcat, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __strcat_power7 + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __strcat_power8 : + (hwcap & PPC_FEATURE_HAS_VSX) + ? __strcat_power7 : __strcat_ppc); #endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S index 663ca36568..dc4bfac9ee 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S @@ -1,5 +1,5 @@ -/* Optimized strpbrk implementation for POWER7. - Copyright (C) 2014 Free Software Foundation, Inc. +/* Optimized strcmp implementation for POWER8/PPC64. + Copyright (C) 2015 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,20 +21,20 @@ #undef EALIGN #define EALIGN(name, alignt, words) \ .section ".text"; \ - ENTRY_2(__strpbrk_power7) \ + ENTRY_2(__strcmp_power8) \ .align ALIGNARG(alignt); \ EALIGN_W_##words; \ - BODY_LABEL(__strpbrk_power7): \ + BODY_LABEL(__strcmp_power8): \ cfi_startproc; \ - LOCALENTRY(__strpbrk_power7) + LOCALENTRY(__strcmp_power8) #undef END #define END(name) \ cfi_endproc; \ - TRACEBACK(__strpbrk_power7) \ - END_2(__strpbrk_power7) + TRACEBACK(__strcmp_power8) \ + END_2(__strcmp_power8) #undef libc_hidden_builtin_def #define libc_hidden_builtin_def(name) -#include <sysdeps/powerpc/powerpc64/power7/strpbrk.S> +#include <sysdeps/powerpc/powerpc64/power8/strcmp.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c index 2013301aa1..c711969992 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c @@ -23,9 +23,12 @@ extern __typeof (strcmp) __strcmp_ppc attribute_hidden; extern __typeof (strcmp) __strcmp_power7 attribute_hidden; +extern __typeof (strcmp) __strcmp_power8 attribute_hidden; libc_ifunc (strcmp, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __strcmp_power7 + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __strcmp_power8 : + (hwcap & PPC_FEATURE_HAS_VSX) + ? 
__strcmp_power7 : __strcmp_ppc); #endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S index 02ffcc89b8..64cbc163a4 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcspn-power7.S +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S @@ -1,5 +1,5 @@ -/* Optimized strcspn implementation for POWER7. - Copyright (C) 2014 Free Software Foundation, Inc. +/* Optimized strcpy implementation for POWER8/PPC64. + Copyright (C) 2015 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,20 +21,20 @@ #undef EALIGN #define EALIGN(name, alignt, words) \ .section ".text"; \ - ENTRY_2(__strcspn_power7) \ + ENTRY_2(__strcpy_power8) \ .align ALIGNARG(alignt); \ EALIGN_W_##words; \ - BODY_LABEL(__strcspn_power7): \ + BODY_LABEL(__strcpy_power8): \ cfi_startproc; \ - LOCALENTRY(__strcspn_power7) + LOCALENTRY(__strcpy_power8) #undef END #define END(name) \ cfi_endproc; \ - TRACEBACK(__strcspn_power7) \ - END_2(__strcspn_power7) + TRACEBACK(__strcpy_power8) \ + END_2(__strcpy_power8) #undef libc_hidden_builtin_def #define libc_hidden_builtin_def(name) -#include <sysdeps/powerpc/powerpc64/power7/strcspn.S> +#include <sysdeps/powerpc/powerpc64/power8/strcpy.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c index 1b6e9e0665..20ef73f7d5 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c @@ -23,9 +23,12 @@ extern __typeof (strcpy) __strcpy_ppc attribute_hidden; extern __typeof (strcpy) __strcpy_power7 attribute_hidden; +extern __typeof (strcpy) __strcpy_power8 attribute_hidden; libc_ifunc (strcpy, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __strcpy_power7 + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __strcpy_power8 : + (hwcap & PPC_FEATURE_HAS_VSX) + ? __strcpy_power7 : __strcpy_ppc); #endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c index 5f8b61054d..39b1aebe9b 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2014 Free Software Foundation, Inc. +/* Copyright (C) 2015 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -13,18 +13,19 @@ You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ + <http://www.gnu.org/licenses/ >. 
*/ #include <string.h> -#define STRCSPN __strcspn_ppc -#ifdef SHARED +#define STRNCAT __strncat_power7 -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(name) \ - __hidden_ver1 (__strcspn_ppc, __GI_strcspn, __strcspn_ppc); -#endif +extern __typeof (strncat) __strncat_power7 attribute_hidden; +extern __typeof (strlen) __strlen_power7 attribute_hidden; +extern __typeof (strnlen) __strnlen_power7 attribute_hidden; +extern __typeof (memcpy) __memcpy_power7 attribute_hidden; -extern __typeof (strcspn) __strcspn_ppc attribute_hidden; +#define strlen __strlen_power7 +#define __strnlen __strnlen_power7 +#define memcpy __memcpy_power7 -#include <string/strcspn.c> +#include <string/strncat.c> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S new file mode 100644 index 0000000000..8d7223d256 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S @@ -0,0 +1,40 @@ +/* Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#undef EALIGN +#define EALIGN(name,alignt,words) \ + .section ".text"; \ + ENTRY_2(__strncmp_power8) \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + BODY_LABEL(__strncmp_power8): \ + cfi_startproc; \ + LOCALENTRY(__strncmp_power8) + +#undef END +#define END(name) \ + cfi_endproc; \ + TRACEBACK(__strncmp_power8) \ + END_2(__strncmp_power8) + + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc64/power8/strncmp.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c index 9829d69395..5e767839b9 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c @@ -25,13 +25,16 @@ extern __typeof (strncmp) __strncmp_ppc attribute_hidden; extern __typeof (strncmp) __strncmp_power4 attribute_hidden; extern __typeof (strncmp) __strncmp_power7 attribute_hidden; +extern __typeof (strncmp) __strncmp_power8 attribute_hidden; /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ libc_ifunc (strncmp, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __strncmp_power7 : - (hwcap & PPC_FEATURE_POWER4) + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __strncmp_power8 : + (hwcap & PPC_FEATURE_HAS_VSX) + ? __strncmp_power7 : + (hwcap & PPC_FEATURE_POWER4) ? __strncmp_power4 : __strncmp_ppc); #endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S index ead4a9afbe..ed906a4394 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S @@ -1,5 +1,5 @@ -/* Optimized strncat implementation for POWER7. - Copyright (C) 2014 Free Software Foundation, Inc. 
+/* Optimized strncpy implementation for POWER8. + Copyright (C) 2015 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -21,22 +21,20 @@ #undef EALIGN #define EALIGN(name, alignt, words) \ .section ".text"; \ - ENTRY_2(__strncat_power7) \ + ENTRY_2(__strncpy_power8) \ .align ALIGNARG(alignt); \ EALIGN_W_##words; \ - BODY_LABEL(__strncat_power7): \ + BODY_LABEL(__strncpy_power8): \ cfi_startproc; \ - LOCALENTRY(__strncat_power7) + LOCALENTRY(__strncpy_power8) #undef END #define END(name) \ cfi_endproc; \ - TRACEBACK(__strncat_power7) \ - END_2(__strncat_power7) + TRACEBACK(__strncpy_power8) \ + END_2(__strncpy_power8) #undef libc_hidden_builtin_def #define libc_hidden_builtin_def(name) -#define STRLEN __strlen_power7 - -#include <sysdeps/powerpc/powerpc64/power7/strncat.S> +#include <sysdeps/powerpc/powerpc64/power8/strncpy.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c index 8fd5e4b0c8..19927bc68c 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c @@ -24,12 +24,15 @@ extern __typeof (strncpy) __strncpy_ppc attribute_hidden; extern __typeof (strncpy) __strncpy_power7 attribute_hidden; +extern __typeof (strncpy) __strncpy_power8 attribute_hidden; /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle ifunc symbol properly. */ libc_ifunc (strncpy, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __strncpy_power7 + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __strncpy_power8 : + (hwcap & PPC_FEATURE_HAS_VSX) + ? __strncpy_power7 : __strncpy_ppc); #endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c deleted file mode 100644 index d543772a97..0000000000 --- a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (C) 2014 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. 
*/ - -#include <string.h> - -#define STRSPN __strspn_ppc -#undef weak_alias -#define weak_alias(name, aliasname) \ - extern __typeof (__strspn_ppc) aliasname \ - __attribute__ ((weak, alias ("__strspn_ppc"))); -#if !defined(NOT_IN_libc) && defined(SHARED) -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(name) \ - __hidden_ver1(__strspn_ppc, __GI_strspn, __strspn_ppc); -#endif - -extern __typeof (strspn) __strspn_ppc attribute_hidden; - -#include <string/strspn.c> diff --git a/sysdeps/powerpc/powerpc64/power4/memset.S b/sysdeps/powerpc/powerpc64/power4/memset.S index 3a1e9dc76a..b433d49be8 100644 --- a/sysdeps/powerpc/powerpc64/power4/memset.S +++ b/sysdeps/powerpc/powerpc64/power4/memset.S @@ -235,7 +235,6 @@ L(medium_28t): END_GEN_TB (memset,TB_TOCLESS) libc_hidden_builtin_def (memset) -#ifndef NO_BZERO_IMPL /* Copied from bzero.S to prevent the linker from inserting a stub between bzero and memset. */ ENTRY (__bzero) @@ -243,7 +242,7 @@ ENTRY (__bzero) mr r5,r4 li r4,0 b L(_memset) -END_GEN_TB (__bzero,TB_TOCLESS) - +END (__bzero) +#ifndef __bzero weak_alias (__bzero, bzero) #endif diff --git a/sysdeps/powerpc/powerpc64/power6/memset.S b/sysdeps/powerpc/powerpc64/power6/memset.S index b5115a7989..6fffe0ec66 100644 --- a/sysdeps/powerpc/powerpc64/power6/memset.S +++ b/sysdeps/powerpc/powerpc64/power6/memset.S @@ -379,7 +379,6 @@ L(medium_28t): END_GEN_TB (memset,TB_TOCLESS) libc_hidden_builtin_def (memset) -#ifndef NO_BZERO_IMPL /* Copied from bzero.S to prevent the linker from inserting a stub between bzero and memset. */ ENTRY (__bzero) @@ -387,7 +386,7 @@ ENTRY (__bzero) mr r5,r4 li r4,0 b L(_memset) -END_GEN_TB (__bzero,TB_TOCLESS) - +END (__bzero) +#ifndef __bzero weak_alias (__bzero, bzero) #endif diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S index 09bff696ff..98b9e54ea9 100644 --- a/sysdeps/powerpc/powerpc64/power7/memcmp.S +++ b/sysdeps/powerpc/powerpc64/power7/memcmp.S @@ -26,18 +26,48 @@ EALIGN (memcmp, 4, 0) CALL_MCOUNT 3 -#define rRTN r3 -#define rSTR1 r3 /* first string arg */ -#define rSTR2 r4 /* second string arg */ -#define rN r5 /* max string length */ -#define rWORD1 r6 /* current word in s1 */ -#define rWORD2 r7 /* current word in s2 */ -#define rWORD3 r8 /* next word in s1 */ -#define rWORD4 r9 /* next word in s2 */ -#define rWORD5 r10 /* next word in s1 */ -#define rWORD6 r11 /* next word in s2 */ -#define rWORD7 r30 /* next word in s1 */ -#define rWORD8 r31 /* next word in s2 */ +#define rRTN r3 +#define rSTR1 r3 /* first string arg */ +#define rSTR2 r4 /* second string arg */ +#define rN r5 /* max string length */ +#define rWORD1 r6 /* current word in s1 */ +#define rWORD2 r7 /* current word in s2 */ +#define rWORD3 r8 /* next word in s1 */ +#define rWORD4 r9 /* next word in s2 */ +#define rWORD5 r10 /* next word in s1 */ +#define rWORD6 r11 /* next word in s2 */ + +#define rOFF8 r20 /* 8 bytes offset. */ +#define rOFF16 r21 /* 16 bytes offset. */ +#define rOFF24 r22 /* 24 bytes offset. */ +#define rOFF32 r23 /* 24 bytes offset. */ +#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */ +#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */ +#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */ +#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */ +#define rSHR r28 /* Unaligned shift right count. */ +#define rSHL r29 /* Unaligned shift left count. 
*/ +#define rWORD7 r30 /* next word in s1 */ +#define rWORD8 r31 /* next word in s2 */ + +#define rWORD8SAVE (-8) +#define rWORD7SAVE (-16) +#define rOFF8SAVE (-24) +#define rOFF16SAVE (-32) +#define rOFF24SAVE (-40) +#define rOFF32SAVE (-48) +#define rSHRSAVE (-56) +#define rSHLSAVE (-64) +#define rWORD8SHIFTSAVE (-72) +#define rWORD2SHIFTSAVE (-80) +#define rWORD4SHIFTSAVE (-88) +#define rWORD6SHIFTSAVE (-96) + +#ifdef __LITTLE_ENDIAN__ +# define LD ldbrx +#else +# define LD ldx +#endif xor r0, rSTR2, rSTR1 cmpldi cr6, rN, 0 @@ -51,10 +81,24 @@ EALIGN (memcmp, 4, 0) /* If less than 8 bytes or not aligned, use the unaligned byte loop. */ blt cr1, L(bytealigned) - std rWORD8, -8(r1) - cfi_offset(rWORD8, -8) - std rWORD7, -16(r1) - cfi_offset(rWORD7, -16) + std rWORD8, rWORD8SAVE(r1) + cfi_offset(rWORD8, rWORD8SAVE) + std rWORD7, rWORD7SAVE(r1) + cfi_offset(rWORD7, rWORD7SAVE) + std rOFF8, rOFF8SAVE(r1) + cfi_offset(rWORD7, rOFF8SAVE) + std rOFF16, rOFF16SAVE(r1) + cfi_offset(rWORD7, rOFF16SAVE) + std rOFF24, rOFF24SAVE(r1) + cfi_offset(rWORD7, rOFF24SAVE) + std rOFF32, rOFF32SAVE(r1) + cfi_offset(rWORD7, rOFF32SAVE) + + li rOFF8,8 + li rOFF16,16 + li rOFF24,24 + li rOFF32,32 + bne L(unaligned) /* At this point we know both strings have the same alignment and the compare length is at least 8 bytes. r12 contains the low order @@ -79,15 +123,8 @@ L(samealignment): sldi rWORD6, r12, 3 srdi r0, rN, 5 /* Divide by 32 */ andi. r12, rN, 24 /* Get the DW remainder */ -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 0(rSTR1) - ld rWORD2, 0(rSTR2) -#endif + LD rWORD1, 0, rSTR1 + LD rWORD2, 0, rSTR2 cmpldi cr1, r12, 16 cmpldi cr7, rN, 32 clrldi rN, rN, 61 @@ -104,15 +141,8 @@ L(dsP1): cmpld cr5, rWORD5, rWORD6 blt cr7, L(dP1x) /* Do something useful in this cycle since we have to branch anyway. */ -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 8(rSTR1) - ld rWORD2, 8(rSTR2) -#endif + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 cmpld cr7, rWORD1, rWORD2 b L(dP1e) /* Remainder is 16 */ @@ -123,15 +153,8 @@ L(dPs2): cmpld cr6, rWORD5, rWORD6 blt cr7, L(dP2x) /* Do something useful in this cycle since we have to branch anyway. */ -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD7, 0, rSTR1 - ldbrx rWORD8, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD7, 8(rSTR1) - ld rWORD8, 8(rSTR2) -#endif + LD rWORD7, rOFF8, rSTR1 + LD rWORD8, rOFF8, rSTR2 cmpld cr5, rWORD7, rWORD8 b L(dP2e) /* Remainder is 24 */ @@ -173,72 +196,43 @@ L(dP1): change any on the early exit path. The key here is the non-early exit path only cares about the condition code (cr5), not about which register pair was used. 
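The memcmp rework that begins here collapses every per-endian #ifdef load pair into one LD macro: ldbrx on little-endian, ldx on big-endian, indexed off the preloaded rOFF8..rOFF32 offset registers. Either way the loaded doubleword orders bytes most-significant-first, so a single unsigned compare is equivalent to a bytewise memcmp of the chunk. A C sketch of the same idea, assuming GCC/Clang builtins:

#include <stdint.h>
#include <string.h>

/* What LD achieves: a load whose integer value ranks bytes in memory
   order, so wa < wb iff memcmp (a, b, 8) < 0.  */
static uint64_t load_be64 (const unsigned char *p)
{
  uint64_t w;
  memcpy (&w, p, sizeof w);           /* native-endian load */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  w = __builtin_bswap64 (w);          /* ldbrx does this in hardware */
#endif
  return w;
}

/* One aligned 8-byte step of the dLoop body.  */
static int cmp_chunk (const unsigned char *a, const unsigned char *b)
{
  uint64_t wa = load_be64 (a), wb = load_be64 (b);
  return wa == wb ? 0 : (wa > wb ? 1 : -1);
}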
*/ -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD5, 0, rSTR1 - ldbrx rWORD6, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD5, 0(rSTR1) - ld rWORD6, 0(rSTR2) -#endif + LD rWORD5, 0, rSTR1 + LD rWORD6, 0, rSTR2 cmpld cr5, rWORD5, rWORD6 blt cr7, L(dP1x) -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 8(rSTR1) - ld rWORD2, 8(rSTR2) -#endif + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 cmpld cr7, rWORD1, rWORD2 L(dP1e): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD3, 0, rSTR1 - ldbrx rWORD4, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD3, 16(rSTR1) - ld rWORD4, 16(rSTR2) -#endif + LD rWORD3, rOFF16, rSTR1 + LD rWORD4, rOFF16, rSTR2 cmpld cr1, rWORD3, rWORD4 -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD5, 0, rSTR1 - ldbrx rWORD6, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD5, 24(rSTR1) - ld rWORD6, 24(rSTR2) -#endif + LD rWORD5, rOFF24, rSTR1 + LD rWORD6, rOFF24, rSTR2 cmpld cr6, rWORD5, rWORD6 bne cr5, L(dLcr5x) bne cr7, L(dLcr7x) -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD7, 0, rSTR1 - ldbrx rWORD8, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ldu rWORD7, 32(rSTR1) - ldu rWORD8, 32(rSTR2) -#endif + LD rWORD7, rOFF32, rSTR1 + LD rWORD8, rOFF32, rSTR2 + addi rSTR1, rSTR1, 32 + addi rSTR2, rSTR2, 32 bne cr1, L(dLcr1) cmpld cr5, rWORD7, rWORD8 bdnz L(dLoop) bne cr6, L(dLcr6) - ld rWORD8, -8(r1) - ld rWORD7, -16(r1) + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) .align 3 L(dP1x): sldi. r12, rN, 3 bne cr5, L(dLcr5x) subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ bne L(d00) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) li rRTN, 0 blr @@ -246,79 +240,41 @@ L(dP1x): .align 4 L(dP2): mtctr r0 -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD5, 0, rSTR1 - ldbrx rWORD6, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD5, 0(rSTR1) - ld rWORD6, 0(rSTR2) -#endif + LD rWORD5, 0, rSTR1 + LD rWORD6, 0, rSTR2 cmpld cr6, rWORD5, rWORD6 blt cr7, L(dP2x) -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD7, 0, rSTR1 - ldbrx rWORD8, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD7, 8(rSTR1) - ld rWORD8, 8(rSTR2) -#endif + LD rWORD7, rOFF8, rSTR1 + LD rWORD8, rOFF8, rSTR2 cmpld cr5, rWORD7, rWORD8 L(dP2e): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 16(rSTR1) - ld rWORD2, 16(rSTR2) -#endif + LD rWORD1, rOFF16, rSTR1 + LD rWORD2, rOFF16, rSTR2 cmpld cr7, rWORD1, rWORD2 -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD3, 0, rSTR1 - ldbrx rWORD4, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD3, 24(rSTR1) - ld rWORD4, 24(rSTR2) -#endif + LD rWORD3, rOFF24, rSTR1 + LD rWORD4, rOFF24, rSTR2 cmpld cr1, rWORD3, rWORD4 -#ifndef __LITTLE_ENDIAN__ addi rSTR1, rSTR1, 8 addi rSTR2, rSTR2, 8 -#endif bne cr6, L(dLcr6) bne cr5, L(dLcr5) b L(dLoop2) -/* Again we are on a early exit path (16-23 byte compare), we want to - only use volatile registers and avoid restoring non-volatile - registers. */ .align 4 L(dP2x): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD3, 0, rSTR1 - ldbrx rWORD4, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD3, 8(rSTR1) - ld rWORD4, 8(rSTR2) -#endif + LD rWORD3, rOFF8, rSTR1 + LD rWORD4, rOFF8, rSTR2 cmpld cr1, rWORD3, rWORD4 sldi. 
r12, rN, 3 bne cr6, L(dLcr6x) -#ifndef __LITTLE_ENDIAN__ addi rSTR1, rSTR1, 8 addi rSTR2, rSTR2, 8 -#endif bne cr1, L(dLcr1x) subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ bne L(d00) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) li rRTN, 0 blr @@ -326,52 +282,22 @@ L(dP2x): .align 4 L(dP3): mtctr r0 -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD3, 0, rSTR1 - ldbrx rWORD4, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD3, 0(rSTR1) - ld rWORD4, 0(rSTR2) -#endif + LD rWORD3, 0, rSTR1 + LD rWORD4, 0, rSTR2 cmpld cr1, rWORD3, rWORD4 L(dP3e): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD5, 0, rSTR1 - ldbrx rWORD6, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD5, 8(rSTR1) - ld rWORD6, 8(rSTR2) -#endif + LD rWORD5, rOFF8, rSTR1 + LD rWORD6, rOFF8, rSTR2 cmpld cr6, rWORD5, rWORD6 blt cr7, L(dP3x) -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD7, 0, rSTR1 - ldbrx rWORD8, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD7, 16(rSTR1) - ld rWORD8, 16(rSTR2) -#endif + LD rWORD7, rOFF16, rSTR1 + LD rWORD8, rOFF16, rSTR2 cmpld cr5, rWORD7, rWORD8 -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 24(rSTR1) - ld rWORD2, 24(rSTR2) -#endif + LD rWORD1, rOFF24, rSTR1 + LD rWORD2, rOFF24, rSTR2 cmpld cr7, rWORD1, rWORD2 -#ifndef __LITTLE_ENDIAN__ addi rSTR1, rSTR1, 16 addi rSTR2, rSTR2, 16 -#endif bne cr1, L(dLcr1) bne cr6, L(dLcr6) b L(dLoop1) @@ -380,26 +306,21 @@ L(dP3e): registers. */ .align 4 L(dP3x): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 16(rSTR1) - ld rWORD2, 16(rSTR2) -#endif + LD rWORD1, rOFF16, rSTR1 + LD rWORD2, rOFF16, rSTR2 cmpld cr7, rWORD1, rWORD2 sldi. r12, rN, 3 bne cr1, L(dLcr1x) -#ifndef __LITTLE_ENDIAN__ addi rSTR1, rSTR1, 16 addi rSTR2, rSTR2, 16 -#endif bne cr6, L(dLcr6x) subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). 
*/ bne cr7, L(dLcr7x) bne L(d00) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) li rRTN, 0 blr @@ -407,46 +328,20 @@ L(dP3x): .align 4 L(dP4): mtctr r0 -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 0(rSTR1) - ld rWORD2, 0(rSTR2) -#endif + LD rWORD1, 0, rSTR1 + LD rWORD2, 0, rSTR2 cmpld cr7, rWORD1, rWORD2 L(dP4e): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD3, 0, rSTR1 - ldbrx rWORD4, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD3, 8(rSTR1) - ld rWORD4, 8(rSTR2) -#endif + LD rWORD3, rOFF8, rSTR1 + LD rWORD4, rOFF8, rSTR2 cmpld cr1, rWORD3, rWORD4 -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD5, 0, rSTR1 - ldbrx rWORD6, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD5, 16(rSTR1) - ld rWORD6, 16(rSTR2) -#endif + LD rWORD5, rOFF16, rSTR1 + LD rWORD6, rOFF16, rSTR2 cmpld cr6, rWORD5, rWORD6 -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD7, 0, rSTR1 - ldbrx rWORD8, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ldu rWORD7, 24(rSTR1) - ldu rWORD8, 24(rSTR2) -#endif + LD rWORD7, rOFF24, rSTR1 + LD rWORD8, rOFF24, rSTR2 + addi rSTR1, rSTR1, 24 + addi rSTR2, rSTR2, 24 cmpld cr5, rWORD7, rWORD8 bne cr7, L(dLcr7) bne cr1, L(dLcr1) @@ -454,51 +349,25 @@ L(dP4e): /* This is the primary loop */ .align 4 L(dLoop): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 8(rSTR1) - ld rWORD2, 8(rSTR2) -#endif + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 cmpld cr1, rWORD3, rWORD4 bne cr6, L(dLcr6) L(dLoop1): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD3, 0, rSTR1 - ldbrx rWORD4, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD3, 16(rSTR1) - ld rWORD4, 16(rSTR2) -#endif + LD rWORD3, rOFF16, rSTR1 + LD rWORD4, rOFF16, rSTR2 cmpld cr6, rWORD5, rWORD6 bne cr5, L(dLcr5) L(dLoop2): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD5, 0, rSTR1 - ldbrx rWORD6, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD5, 24(rSTR1) - ld rWORD6, 24(rSTR2) -#endif + LD rWORD5, rOFF24, rSTR1 + LD rWORD6, rOFF24, rSTR2 cmpld cr5, rWORD7, rWORD8 bne cr7, L(dLcr7) L(dLoop3): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD7, 0, rSTR1 - ldbrx rWORD8, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ldu rWORD7, 32(rSTR1) - ldu rWORD8, 32(rSTR2) -#endif + LD rWORD7, rOFF32, rSTR1 + LD rWORD8, rOFF32, rSTR2 + addi rSTR1, rSTR1, 32 + addi rSTR2, rSTR2, 32 bne cr1, L(dLcr1) cmpld cr7, rWORD1, rWORD2 bdnz L(dLoop) @@ -519,62 +388,75 @@ L(d14): sldi. r12, rN, 3 bne cr5, L(dLcr5) L(d04): - ld rWORD8, -8(r1) - ld rWORD7, -16(r1) + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */ - beq L(zeroLength) + beq L(duzeroLength) /* At this point we have a remainder of 1 to 7 bytes to compare. Since we are aligned it is safe to load the whole double word, and use shift right double to eliminate bits beyond the compare length. 
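The L(d00) tail that follows handles the final 1-7 bytes the comment describes: both pointers are doubleword aligned by then, so a full 8-byte load is safe, and srd discards the bytes past the compare length (rN holds 64 minus eight times the remainder). In C, on the memory-ordered words of the previous sketch, assuming rem_bytes is 1..7:

#include <stdint.h>

static int cmp_tail (uint64_t wa, uint64_t wb, unsigned int rem_bytes)
{
  unsigned int sh = 64 - rem_bytes * 8;  /* subfic rN, r12, 64 */
  wa >>= sh;                             /* srd rWORD1, rWORD1, rN */
  wb >>= sh;                             /* keeps only the live bytes */
  return wa == wb ? 0 : (wa > wb ? 1 : -1);
}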
*/ L(d00): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 8(rSTR1) - ld rWORD2, 8(rSTR2) -#endif + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 srd rWORD1, rWORD1, rN srd rWORD2, rWORD2, rN cmpld cr7, rWORD1, rWORD2 bne cr7, L(dLcr7x) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) li rRTN, 0 blr .align 4 L(dLcr7): - ld rWORD8, -8(r1) - ld rWORD7, -16(r1) + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) L(dLcr7x): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) li rRTN, 1 bgtlr cr7 li rRTN, -1 blr .align 4 L(dLcr1): - ld rWORD8, -8(r1) - ld rWORD7, -16(r1) + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) L(dLcr1x): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) li rRTN, 1 bgtlr cr1 li rRTN, -1 blr .align 4 L(dLcr6): - ld rWORD8, -8(r1) - ld rWORD7, -16(r1) + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) L(dLcr6x): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) li rRTN, 1 bgtlr cr6 li rRTN, -1 blr .align 4 L(dLcr5): - ld rWORD8, -8(r1) - ld rWORD7, -16(r1) + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) L(dLcr5x): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) li rRTN, 1 bgtlr cr5 li rRTN, -1 @@ -583,10 +465,6 @@ L(dLcr5x): .align 4 L(bytealigned): mtctr rN -#if 0 -/* Huh? We've already branched on cr6! */ - beq cr6, L(zeroLength) -#endif /* We need to prime this loop. This loop is swing modulo scheduled to avoid pipe delays. The dependent instruction latencies (load to @@ -685,6 +563,7 @@ L(b11): L(bx12): sub rRTN, rWORD1, rWORD2 blr + .align 4 L(zeroLength): li rRTN, 0 @@ -705,42 +584,36 @@ L(zeroLength): we need to adjust the length (rN) and special case the loop versioning for the first DW. This ensures that the loop count is correct and the first DW (shifted) is in the expected resister pair. */ -#define rSHL r29 /* Unaligned shift left count. */ -#define rSHR r28 /* Unaligned shift right count. */ -#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */ -#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */ -#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */ -#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */ L(unaligned): - std rSHL, -24(r1) - cfi_offset(rSHL, -24) + std rSHL, rSHLSAVE(r1) + cfi_offset(rSHL, rSHLSAVE) clrldi rSHL, rSTR2, 61 beq cr6, L(duzeroLength) - std rSHR, -32(r1) - cfi_offset(rSHR, -32) + std rSHR, rSHRSAVE(r1) + cfi_offset(rSHR, rSHRSAVE) beq cr5, L(DWunaligned) - std rWORD8_SHIFT, -40(r1) - cfi_offset(rWORD8_SHIFT, -40) + std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) + cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) /* Adjust the logical start of rSTR2 to compensate for the extra bits in the 1st rSTR1 DW. */ sub rWORD8_SHIFT, rSTR2, r12 /* But do not attempt to address the DW before that DW that contains the actual start of rSTR2. */ clrrdi rSTR2, rSTR2, 3 - std rWORD2_SHIFT, -48(r1) - cfi_offset(rWORD2_SHIFT, -48) + std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) + cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) /* Compute the left/right shift counts for the unaligned rSTR2, compensating for the logical (DW aligned) start of rSTR1. 
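On the unaligned path being set up here, the loop never performs a misaligned load; each aligned doubleword is split with srd/sld and OR-merged with its neighbour to synthesize the doubleword an unaligned load would have produced (the "or rWORD2, r0, rWORD8_SHIFT" pattern below). A sketch of that splice; shl is the source misalignment in bits, which is 8..56 on this path, so neither shift is by the full register width:

#include <stdint.h>

/* Merge two adjacent aligned (memory-ordered) doublewords into the
   one an unaligned load at misalignment shl/8 bytes would return.  */
static uint64_t splice (uint64_t prev, uint64_t next, unsigned int shl)
{
  unsigned int shr = 64 - shl;           /* subfic rSHR, rSHL, 64 */
  return (prev << shl) | (next >> shr);  /* sld / srd / or */
}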
*/ clrldi rSHL, rWORD8_SHIFT, 61 clrrdi rSTR1, rSTR1, 3 - std rWORD4_SHIFT, -56(r1) - cfi_offset(rWORD4_SHIFT, -56) + std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) + cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) sldi rSHL, rSHL, 3 cmpld cr5, rWORD8_SHIFT, rSTR2 add rN, rN, r12 sldi rWORD6, r12, 3 - std rWORD6_SHIFT, -64(r1) - cfi_offset(rWORD6_SHIFT, -64) + std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) + cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) subfic rSHR, rSHL, 64 srdi r0, rN, 5 /* Divide by 32 */ andi. r12, rN, 24 /* Get the DW remainder */ @@ -750,25 +623,13 @@ L(unaligned): this may cross a page boundary and cause a page fault. */ li rWORD8, 0 blt cr5, L(dus0) -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD8, 0, rSTR2 + LD rWORD8, 0, rSTR2 addi rSTR2, rSTR2, 8 -#else - ld rWORD8, 0(rSTR2) - addi rSTR2, rSTR2, 8 -#endif sld rWORD8, rWORD8, rSHL L(dus0): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 0(rSTR1) - ld rWORD2, 0(rSTR2) -#endif + LD rWORD1, 0, rSTR1 + LD rWORD2, 0, rSTR2 cmpldi cr1, r12, 16 cmpldi cr7, rN, 32 srd r12, rWORD2, rSHR @@ -796,12 +657,7 @@ L(dusP1): beq L(duZeroReturn) li r0, 0 ble cr7, L(dutrim) -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD2, 0, rSTR2 - addi rSTR2, rSTR2, 8 -#else - ld rWORD2, 8(rSTR2) -#endif + LD rWORD2, rOFF8, rSTR2 srd r0, rWORD2, rSHR b L(dutrim) /* Remainder is 16 */ @@ -832,27 +688,21 @@ L(duPs4): compare length is at least 8 bytes. */ .align 4 L(DWunaligned): - std rWORD8_SHIFT, -40(r1) - cfi_offset(rWORD8_SHIFT, -40) + std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) + cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE) clrrdi rSTR2, rSTR2, 3 - std rWORD2_SHIFT, -48(r1) - cfi_offset(rWORD2_SHIFT, -48) + std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) + cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE) srdi r0, rN, 5 /* Divide by 32 */ - std rWORD4_SHIFT, -56(r1) - cfi_offset(rWORD4_SHIFT, -56) + std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) + cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE) andi. 
r12, rN, 24 /* Get the DW remainder */ - std rWORD6_SHIFT, -64(r1) - cfi_offset(rWORD6_SHIFT, -64) + std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) + cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE) sldi rSHL, rSHL, 3 -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD6, 0, rSTR2 + LD rWORD6, 0, rSTR2 + LD rWORD8, rOFF8, rSTR2 addi rSTR2, rSTR2, 8 - ldbrx rWORD8, 0, rSTR2 - addi rSTR2, rSTR2, 8 -#else - ld rWORD6, 0(rSTR2) - ldu rWORD8, 8(rSTR2) -#endif cmpldi cr1, r12, 16 cmpldi cr7, rN, 32 clrldi rN, rN, 61 @@ -867,52 +717,26 @@ L(DWunaligned): .align 4 L(duP1): srd r12, rWORD8, rSHR -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD7, 0, rSTR1 - addi rSTR1, rSTR1, 8 -#else - ld rWORD7, 0(rSTR1) -#endif + LD rWORD7, 0, rSTR1 sld rWORD8_SHIFT, rWORD8, rSHL or rWORD8, r12, rWORD6_SHIFT blt cr7, L(duP1x) L(duP1e): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 8(rSTR1) - ld rWORD2, 8(rSTR2) -#endif + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 cmpld cr5, rWORD7, rWORD8 srd r0, rWORD2, rSHR sld rWORD2_SHIFT, rWORD2, rSHL or rWORD2, r0, rWORD8_SHIFT -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD3, 0, rSTR1 - ldbrx rWORD4, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD3, 16(rSTR1) - ld rWORD4, 16(rSTR2) -#endif + LD rWORD3, rOFF16, rSTR1 + LD rWORD4, rOFF16, rSTR2 cmpld cr7, rWORD1, rWORD2 srd r12, rWORD4, rSHR sld rWORD4_SHIFT, rWORD4, rSHL bne cr5, L(duLcr5) or rWORD4, r12, rWORD2_SHIFT -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD5, 0, rSTR1 - ldbrx rWORD6, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD5, 24(rSTR1) - ld rWORD6, 24(rSTR2) -#endif + LD rWORD5, rOFF24, rSTR1 + LD rWORD6, rOFF24, rSTR2 cmpld cr1, rWORD3, rWORD4 srd r0, rWORD6, rSHR sld rWORD6_SHIFT, rWORD6, rSHL @@ -932,82 +756,47 @@ L(duP1x): beq L(duZeroReturn) li r0, 0 ble cr7, L(dutrim) -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD2, 0, rSTR2 - addi rSTR2, rSTR2, 8 -#else - ld rWORD2, 8(rSTR2) -#endif + LD rWORD2, rOFF8, rSTR2 srd r0, rWORD2, rSHR b L(dutrim) /* Remainder is 16 */ .align 4 L(duP2): srd r0, rWORD8, rSHR -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD5, 0, rSTR1 - addi rSTR1, rSTR1, 8 -#else - ld rWORD5, 0(rSTR1) -#endif + LD rWORD5, 0, rSTR1 or rWORD6, r0, rWORD6_SHIFT sld rWORD6_SHIFT, rWORD8, rSHL L(duP2e): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD7, 0, rSTR1 - ldbrx rWORD8, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD7, 8(rSTR1) - ld rWORD8, 8(rSTR2) -#endif + LD rWORD7, rOFF8, rSTR1 + LD rWORD8, rOFF8, rSTR2 cmpld cr6, rWORD5, rWORD6 srd r12, rWORD8, rSHR sld rWORD8_SHIFT, rWORD8, rSHL or rWORD8, r12, rWORD6_SHIFT blt cr7, L(duP2x) -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 16(rSTR1) - ld rWORD2, 16(rSTR2) -#endif + LD rWORD1, rOFF16, rSTR1 + LD rWORD2, rOFF16, rSTR2 cmpld cr5, rWORD7, rWORD8 bne cr6, L(duLcr6) srd r0, rWORD2, rSHR sld rWORD2_SHIFT, rWORD2, rSHL or rWORD2, r0, rWORD8_SHIFT -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD3, 0, rSTR1 - ldbrx rWORD4, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD3, 24(rSTR1) - ld rWORD4, 24(rSTR2) -#endif + LD rWORD3, rOFF24, rSTR1 + LD rWORD4, rOFF24, rSTR2 cmpld cr7, rWORD1, rWORD2 bne cr5, L(duLcr5) srd r12, rWORD4, rSHR sld rWORD4_SHIFT, rWORD4, rSHL or rWORD4, r12, rWORD2_SHIFT -#ifndef __LITTLE_ENDIAN__ addi rSTR1, rSTR1, 8 addi rSTR2, rSTR2, 8 -#endif cmpld cr1, rWORD3, rWORD4 b L(duLoop2) .align 4 L(duP2x): cmpld cr5, 
rWORD7, rWORD8 -#ifndef __LITTLE_ENDIAN__ addi rSTR1, rSTR1, 8 addi rSTR2, rSTR2, 8 -#endif bne cr6, L(duLcr6) sldi. rN, rN, 3 bne cr5, L(duLcr5) @@ -1015,12 +804,7 @@ L(duP2x): beq L(duZeroReturn) li r0, 0 ble cr7, L(dutrim) -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD2, 0, rSTR2 - addi rSTR2, rSTR2, 8 -#else - ld rWORD2, 8(rSTR2) -#endif + LD rWORD2, rOFF8, rSTR2 srd r0, rWORD2, rSHR b L(dutrim) @@ -1028,73 +812,39 @@ L(duP2x): .align 4 L(duP3): srd r12, rWORD8, rSHR -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD3, 0, rSTR1 - addi rSTR1, rSTR1, 8 -#else - ld rWORD3, 0(rSTR1) -#endif + LD rWORD3, 0, rSTR1 sld rWORD4_SHIFT, rWORD8, rSHL or rWORD4, r12, rWORD6_SHIFT L(duP3e): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD5, 0, rSTR1 - ldbrx rWORD6, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD5, 8(rSTR1) - ld rWORD6, 8(rSTR2) -#endif + LD rWORD5, rOFF8, rSTR1 + LD rWORD6, rOFF8, rSTR2 cmpld cr1, rWORD3, rWORD4 srd r0, rWORD6, rSHR sld rWORD6_SHIFT, rWORD6, rSHL or rWORD6, r0, rWORD4_SHIFT -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD7, 0, rSTR1 - ldbrx rWORD8, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD7, 16(rSTR1) - ld rWORD8, 16(rSTR2) -#endif + LD rWORD7, rOFF16, rSTR1 + LD rWORD8, rOFF16, rSTR2 cmpld cr6, rWORD5, rWORD6 bne cr1, L(duLcr1) srd r12, rWORD8, rSHR sld rWORD8_SHIFT, rWORD8, rSHL or rWORD8, r12, rWORD6_SHIFT blt cr7, L(duP3x) -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 24(rSTR1) - ld rWORD2, 24(rSTR2) -#endif + LD rWORD1, rOFF24, rSTR1 + LD rWORD2, rOFF24, rSTR2 cmpld cr5, rWORD7, rWORD8 bne cr6, L(duLcr6) srd r0, rWORD2, rSHR sld rWORD2_SHIFT, rWORD2, rSHL or rWORD2, r0, rWORD8_SHIFT -#ifndef __LITTLE_ENDIAN__ addi rSTR1, rSTR1, 16 addi rSTR2, rSTR2, 16 -#endif cmpld cr7, rWORD1, rWORD2 b L(duLoop1) .align 4 L(duP3x): -#ifndef __LITTLE_ENDIAN__ addi rSTR1, rSTR1, 16 addi rSTR2, rSTR2, 16 -#endif -#if 0 -/* Huh? We've already branched on cr1! */ - bne cr1, L(duLcr1) -#endif cmpld cr5, rWORD7, rWORD8 bne cr6, L(duLcr6) sldi. 
rN, rN, 3 @@ -1103,12 +853,7 @@ L(duP3x): beq L(duZeroReturn) li r0, 0 ble cr7, L(dutrim) -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD2, 0, rSTR2 - addi rSTR2, rSTR2, 8 -#else - ld rWORD2, 8(rSTR2) -#endif + LD rWORD2, rOFF8, rSTR2 srd r0, rWORD2, rSHR b L(dutrim) @@ -1117,51 +862,27 @@ L(duP3x): L(duP4): mtctr r0 srd r0, rWORD8, rSHR -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - addi rSTR1, rSTR1, 8 -#else - ld rWORD1, 0(rSTR1) -#endif + LD rWORD1, 0, rSTR1 sld rWORD2_SHIFT, rWORD8, rSHL or rWORD2, r0, rWORD6_SHIFT L(duP4e): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD3, 0, rSTR1 - ldbrx rWORD4, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD3, 8(rSTR1) - ld rWORD4, 8(rSTR2) -#endif + LD rWORD3, rOFF8, rSTR1 + LD rWORD4, rOFF8, rSTR2 cmpld cr7, rWORD1, rWORD2 srd r12, rWORD4, rSHR sld rWORD4_SHIFT, rWORD4, rSHL or rWORD4, r12, rWORD2_SHIFT -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD5, 0, rSTR1 - ldbrx rWORD6, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD5, 16(rSTR1) - ld rWORD6, 16(rSTR2) -#endif + LD rWORD5, rOFF16, rSTR1 + LD rWORD6, rOFF16, rSTR2 cmpld cr1, rWORD3, rWORD4 bne cr7, L(duLcr7) srd r0, rWORD6, rSHR sld rWORD6_SHIFT, rWORD6, rSHL or rWORD6, r0, rWORD4_SHIFT -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD7, 0, rSTR1 - ldbrx rWORD8, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ldu rWORD7, 24(rSTR1) - ldu rWORD8, 24(rSTR2) -#endif + LD rWORD7, rOFF24, rSTR1 + LD rWORD8, rOFF24, rSTR2 + addi rSTR1, rSTR1, 24 + addi rSTR2, rSTR2, 24 cmpld cr6, rWORD5, rWORD6 bne cr1, L(duLcr1) srd r12, rWORD8, rSHR @@ -1172,60 +893,34 @@ L(duP4e): /* This is the primary loop */ .align 4 L(duLoop): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 - ldbrx rWORD2, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD1, 8(rSTR1) - ld rWORD2, 8(rSTR2) -#endif + LD rWORD1, rOFF8, rSTR1 + LD rWORD2, rOFF8, rSTR2 cmpld cr1, rWORD3, rWORD4 bne cr6, L(duLcr6) srd r0, rWORD2, rSHR sld rWORD2_SHIFT, rWORD2, rSHL or rWORD2, r0, rWORD8_SHIFT L(duLoop1): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD3, 0, rSTR1 - ldbrx rWORD4, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD3, 16(rSTR1) - ld rWORD4, 16(rSTR2) -#endif + LD rWORD3, rOFF16, rSTR1 + LD rWORD4, rOFF16, rSTR2 cmpld cr6, rWORD5, rWORD6 bne cr5, L(duLcr5) srd r12, rWORD4, rSHR sld rWORD4_SHIFT, rWORD4, rSHL or rWORD4, r12, rWORD2_SHIFT L(duLoop2): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD5, 0, rSTR1 - ldbrx rWORD6, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ld rWORD5, 24(rSTR1) - ld rWORD6, 24(rSTR2) -#endif + LD rWORD5, rOFF24, rSTR1 + LD rWORD6, rOFF24, rSTR2 cmpld cr5, rWORD7, rWORD8 bne cr7, L(duLcr7) srd r0, rWORD6, rSHR sld rWORD6_SHIFT, rWORD6, rSHL or rWORD6, r0, rWORD4_SHIFT L(duLoop3): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD7, 0, rSTR1 - ldbrx rWORD8, 0, rSTR2 - addi rSTR1, rSTR1, 8 - addi rSTR2, rSTR2, 8 -#else - ldu rWORD7, 32(rSTR1) - ldu rWORD8, 32(rSTR2) -#endif + LD rWORD7, rOFF32, rSTR1 + LD rWORD8, rOFF32, rSTR2 + addi rSTR1, rSTR1, 32 + addi rSTR2, rSTR2, 32 cmpld cr7, rWORD1, rWORD2 bne cr1, L(duLcr1) srd r12, rWORD8, rSHR @@ -1234,10 +929,6 @@ L(duLoop3): bdnz L(duLoop) L(duL4): -#if 0 -/* Huh? We've already branched on cr1! 
*/ - bne cr1, L(duLcr1) -#endif cmpld cr1, rWORD3, rWORD4 bne cr6, L(duLcr6) cmpld cr6, rWORD5, rWORD6 @@ -1264,99 +955,102 @@ L(du14): beq L(duZeroReturn) li r0, 0 ble cr7, L(dutrim) -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD2, 0, rSTR2 - addi rSTR2, rSTR2, 8 -#else - ld rWORD2, 8(rSTR2) -#endif + LD rWORD2, rOFF8, rSTR2 srd r0, rWORD2, rSHR .align 4 L(dutrim): -#ifdef __LITTLE_ENDIAN__ - ldbrx rWORD1, 0, rSTR1 -#else - ld rWORD1, 8(rSTR1) -#endif + LD rWORD1, rOFF8, rSTR1 ld rWORD8, -8(r1) subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */ or rWORD2, r0, rWORD8_SHIFT - ld rWORD7, -16(r1) - ld rSHL, -24(r1) + ld rWORD7, rWORD7SAVE(r1) + ld rSHL, rSHLSAVE(r1) srd rWORD1, rWORD1, rN srd rWORD2, rWORD2, rN - ld rSHR, -32(r1) - ld rWORD8_SHIFT, -40(r1) + ld rSHR, rSHRSAVE(r1) + ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) li rRTN, 0 cmpld cr7, rWORD1, rWORD2 - ld rWORD2_SHIFT, -48(r1) - ld rWORD4_SHIFT, -56(r1) + ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) + ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) beq cr7, L(dureturn24) li rRTN, 1 - ld rWORD6_SHIFT, -64(r1) + ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) bgtlr cr7 li rRTN, -1 blr .align 4 L(duLcr7): - ld rWORD8, -8(r1) - ld rWORD7, -16(r1) + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) li rRTN, 1 bgt cr7, L(dureturn29) - ld rSHL, -24(r1) - ld rSHR, -32(r1) + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) li rRTN, -1 b L(dureturn27) .align 4 L(duLcr1): - ld rWORD8, -8(r1) - ld rWORD7, -16(r1) + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) li rRTN, 1 bgt cr1, L(dureturn29) - ld rSHL, -24(r1) - ld rSHR, -32(r1) + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) li rRTN, -1 b L(dureturn27) .align 4 L(duLcr6): - ld rWORD8, -8(r1) - ld rWORD7, -16(r1) + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) li rRTN, 1 bgt cr6, L(dureturn29) - ld rSHL, -24(r1) - ld rSHR, -32(r1) + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) li rRTN, -1 b L(dureturn27) .align 4 L(duLcr5): - ld rWORD8, -8(r1) - ld rWORD7, -16(r1) + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) li rRTN, 1 bgt cr5, L(dureturn29) - ld rSHL, -24(r1) - ld rSHR, -32(r1) + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) li rRTN, -1 b L(dureturn27) + .align 3 L(duZeroReturn): li rRTN, 0 .align 4 L(dureturn): - ld rWORD8, -8(r1) - ld rWORD7, -16(r1) + ld rWORD8, rWORD8SAVE(r1) + ld rWORD7, rWORD7SAVE(r1) L(dureturn29): - ld rSHL, -24(r1) - ld rSHR, -32(r1) + ld rSHL, rSHLSAVE(r1) + ld rSHR, rSHRSAVE(r1) L(dureturn27): - ld rWORD8_SHIFT, -40(r1) -L(dureturn26): - ld rWORD2_SHIFT, -48(r1) -L(dureturn25): - ld rWORD4_SHIFT, -56(r1) + ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1) + ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1) + ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1) L(dureturn24): - ld rWORD6_SHIFT, -64(r1) + ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1) + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) blr + L(duzeroLength): + ld rOFF8, rOFF8SAVE(r1) + ld rOFF16, rOFF16SAVE(r1) + ld rOFF24, rOFF24SAVE(r1) + ld rOFF32, rOFF32SAVE(r1) li rRTN, 0 blr diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S index 6b8999dc1f..14df042785 100644 --- a/sysdeps/powerpc/powerpc64/power7/memset.S +++ b/sysdeps/powerpc/powerpc64/power7/memset.S @@ -383,7 +383,6 @@ L(small): END_GEN_TB (memset,TB_TOCLESS) libc_hidden_builtin_def (memset) -#ifndef NO_BZERO_IMPL /* Copied from bzero.S to prevent the linker from inserting a stub between 
bzero and memset. */ ENTRY (__bzero) @@ -391,7 +390,7 @@ ENTRY (__bzero) mr r5,r4 li r4,0 b L(_memset) -END_GEN_TB (__bzero,TB_TOCLESS) - +END (__bzero) +#ifndef __bzero weak_alias (__bzero, bzero) #endif diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S index f16a9d8a88..ade2811a6e 100644 --- a/sysdeps/powerpc/powerpc64/power7/strcmp.S +++ b/sysdeps/powerpc/powerpc64/power7/strcmp.S @@ -25,122 +25,96 @@ /* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */ + .machine power7 EALIGN (strcmp, 4, 0) CALL_MCOUNT 2 or r9, r3, r4 rldicl. r10, r9, 0, 61 /* are s1 and s2 8 byte aligned..? */ bne cr0, L(process_unaligned_bytes) + li r5, 0 + .align 4 /* process input parameters on double word aligned boundary */ - ld r9, 0(r4) /* load s2 at offset=0 */ - li r10, 0 /* load mask=0 */ - cmpb r10, r9, r10 /* compare bytes at s2 with mask */ - cmpdi cr7, r10, 0 /* is NULL found ..? is end of string HIT */ - bne cr7, L(process_unaligned_bytes) /* process byte by byte */ - - ld r10, 0(r3) /* load s1 at offset=0 */ - li r8, 0 /* load mask=0 */ - cmpb r8, r10, r8 /* compare bytes at s1 with mask */ - cmpdi cr7, r8, 0 /* is NULL found ..? is end of string HIT */ - bne cr7, L(process_unaligned_bytes) /* process byte by byte */ - -/*s1 and s2 does not contain NULL now , so compare all 8 bytes in a GO */ - cmpb r9, r10, r9 /* compare s1 and s2 */ - cmpdi cr7, r9, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */ - bne cr7, L(process_unaligned_bytes) /* s1,s2 mismatch found */ - - addi r5, r3, 8 /* save next offset of s2 */ - addi r11, r4, 8 /* save next offset of s1 */ - ld r8, 8(r4) /* load s2 at offset=8 */ - li r9, 0 /* load mask=0 */ - cmpb r9, r8, r9 /* compare bytes at s2 with mask */ - cmpdi cr7, r9, 0 /* NULL found ..? */ - bne cr7, L(processBytes)/* update input and process bytes one by one */ - - mr r9, r4 /* save s2 */ - li r10, 0 /* load mask=0 */ - - ld r7, 8(r3) /* load s1 at offset=8 */ - cmpb r6, r7, r10 /* compare bytes at s1 with mask */ - cmpdi cr7, r6, 0 /* is NULL found */ - bne cr7, L(processBytes)/* mismatch, so process one by one */ - L(unrollDword): - cmpb r8, r7, r8 /* compare s1 and s2 */ - cmpdi cr7, r8, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */ - bne cr7, L(processBytes)/* mismatch with s1 and s2 */ - - addi r5, r3, 16 /* save offset=16 of s1 */ - addi r4, r9, 16 /* save offset=16 of s2 */ - ld r8, 16(r9) /* load s2 at offset=16 */ - cmpb r7, r8, r10 /* compare bytes at s2 with mask */ - cmpdi cr7, r7, 0 /* NULL found ..? */ - bne cr7, L(update2processBytes) - - ld r7, 16(r3) /* load s1 at offset=16 */ - cmpb r6, r7, r10 /* check s1 for end of string */ - cmpdi cr7, r6, 0 /* end of s1 ?,then handle byte by byte */ - bne 7,L(update2processBytes) - - cmpb r8, r7, r8 /* compare s1 and s2 double words */ - cmpdi cr7, r8, -1 /* compare results with 0xFFFFFFFFFFFFFFFF */ - bne cr7,L(update2processBytes) - - addi r5, r3, 24 /* update s1 to offset=24 */ - addi r4, r9, 24 /* update s2 to offset=24 */ - - ld r8, 24(r9) /* load s2 */ - cmpb r7, r8, r10 /* compare s2 for NULL */ - cmpdi cr7, r7, 0 /* verify if s2 is ending now */ - bne cr7,L(update2processBytes) - - ld r7, 24(r3) /* load s1 at offset=24 */ - cmpb r6, r7, r10 /* verify for NULL */ - cmpdi cr7, r6, 0 /* is NULL found */ - bne cr7, L(update2processBytes) - - cmpb r8, r7, r8 /* compare s1 and s2 */ - cmpdi cr7, r8, -1 /* are s1 and s2 same ..? 
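The new strcmp main loop above leans on cmpb twice: compared against a zero register it flags any NUL byte in the doubleword, and compared between the two operands (at L(different)) it marks the matching bytes so cntlzd can locate the first mismatch. cmpb writes 0xff into each result byte where the operand bytes are equal; a portable C model of the NUL test:

#include <stdint.h>

/* Emulates cmpb rTMP, rWORD, 0: byte i of the result is 0xff iff
   byte i of w is zero, so a nonzero result means a NUL is present.  */
static uint64_t cmpb_zero (uint64_t w)
{
  uint64_t r = 0;
  for (int i = 0; i < 8; i++)
    if (((w >> (i * 8)) & 0xff) == 0)
      r |= (uint64_t) 0xff << (i * 8);
  return r;
}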
*/ - bne cr7, L(update2processBytes) - - addi r7, r9, 32 /* update s2 to next double word */ - addi r3, r3, 32 /* update s1 to next double word */ - - ld r8, 32(r9) /* load s2 */ - mr r4, r7 /* save s2 */ - cmpb r6, r8, r10 /* compare s2 with NULL */ - cmpdi cr7, r6, 0 /* end of s2 ..? */ - bne cr7, L(process_unaligned_bytes) - - ld r6, 0(r3) /* load and compare s1 for NULL */ - cmpb r5, r6, r10 - cmpdi cr7, r5, 0 - bne cr7, L(process_unaligned_bytes) - - cmpb r8, r6, r8 /* compare s1 and s2 */ - cmpdi cr7, r8, -1 - bne cr7, L(process_unaligned_bytes) - - addi r5, r3, 8 /* increment s1 and d2 here */ - addi r11, r9, 40 - - ld r8, 40(r9) /* process s2 now */ - cmpb r9, r8, r10 - cmpdi cr7, r9, 0 - bne cr7, L(processBytes) - - mr r9, r7 - ld r7, 8(r3) /* process s1 now */ - cmpb r6, r7, r10 - cmpdi cr7, r6, 0 - beq cr7, L(unrollDword) /* unroll to compare s1 and s2 */ - -L(processBytes): - mr r4, r11 /* update input params */ - mr r3, r5 - - .p2align 4 + ld r8,0(r3) + ld r10,0(r4) + cmpb r7,r8,r5 + cmpdi cr7,r7,0 + mr r9,r7 + bne cr7,L(null_found) + cmpld cr7,r8,r10 + bne cr7,L(different) + + ld r8,8(r3) + ld r10,8(r4) + cmpb r7,r8,r5 + cmpdi cr7,r7,0 + mr r9,r7 + bne cr7,L(null_found) + cmpld cr7,r8,r10 + bne cr7,L(different) + + ld r8,16(r3) + ld r10,16(r4) + cmpb r7,r8,r5 + cmpdi cr7,r7,0 + mr r9,r7 + bne cr7,L(null_found) + cmpld cr7,r8,r10 + bne cr7,L(different) + + ld r8,24(r3) + ld r10,24(r4) + cmpb r7,r8,r5 + cmpdi cr7,r7,0 + mr r9,r7 + bne cr7,L(null_found) + cmpld cr7,r8,r10 + bne cr7,L(different) + + addi r3, r3, 32 + addi r4, r4, 32 + beq cr7, L(unrollDword) + + .align 4 +L(null_found): +#ifdef __LITTLE_ENDIAN__ + neg r7,r9 + and r9,r9,r7 + li r7,-1 + cntlzd r9,r9 + subfic r9,r9,71 + sld r9,r7,r9 +#else + cntlzd r9,r9 + li r7,-1 + addi r9,r9,8 + srd r9,r7,r9 +#endif + or r8,r8,r9 + or r10,r10,r9 + +L(different): + cmpb r9,r8,r10 +#ifdef __LITTLE_ENDIAN__ + addi r7,r9,1 + andc r9,r7,r9 + cntlzd r9,r9 + subfic r9,r9,63 +#else + not r9,r9 + cntlzd r9,r9 + subfic r9,r9,56 +#endif + srd r3,r8,r9 + srd r10,r10,r9 + rldicl r10,r10,0,56 + rldicl r3,r3,0,56 + subf r3,r10,r3 + blr + + .align 4 L(process_unaligned_bytes): lbz r9, 0(r3) /* load byte from s1 */ lbz r10, 0(r4) /* load byte from s2 */ @@ -172,24 +146,19 @@ L(process_unaligned_bytes): addi r4, r4, 4 /* increment s2 by unroll factor */ beq cr6, L(process_unaligned_bytes) /* unroll byte processing */ - .p2align 4 + .align 4 L(ComputeDiff): extsw r9, r9 subf r10, r10, r9 /* compute s1 - s2 */ extsw r3, r10 blr /* return */ - .p2align 4 + .align 4 L(diffOfNULL): li r9, 0 subf r10, r10, r9 /* compute s1 - s2 */ extsw r3, r10 /* sign extend result */ blr /* return */ - .p2align 4 -L(update2processBytes): - mr r3, r5 /* update and proceed */ - b L(process_unaligned_bytes) - END (strcmp) libc_hidden_builtin_def (strcmp) diff --git a/sysdeps/powerpc/powerpc64/power7/strcpy.S b/sysdeps/powerpc/powerpc64/power7/strcpy.S index ce71982eaf..115f98a304 100644 --- a/sysdeps/powerpc/powerpc64/power7/strcpy.S +++ b/sysdeps/powerpc/powerpc64/power7/strcpy.S @@ -31,8 +31,6 @@ if (((((uintptr_t)dst & 0x7UL) == 0) && ((uintptr_t)src & 0x7UL) == 0)) goto aligned_doubleword_copy; - if (((((uintptr_t)dst & 0x3UL) == 0) && ((uintptr_t)src & 0x3UL) == 0)) - goto aligned_word_copy; if (((uintptr_t)dst & 0x7UL) == ((uintptr_t)src & 0x7UL)) goto same_alignment; goto unaligned; @@ -70,9 +68,18 @@ EALIGN (FUNC_NAME, 4, 0) #endif or rTMP, rSRC, rRTN clrldi. 
rTMP, rTMP, 61 - bne L(check_word_alignment) + bne L(check_alignment) b L(aligned_doubleword_copy) + .align 4 +L(check_alignment): + rldicl rRTNAL, rRTN, 0, 61 + rldicl rSRCAL, rSRC, 0, 61 + cmpld cr7, rSRCAL, rRTNAL + beq cr7, L(same_alignment) + b L(unaligned) + + .align 4 L(same_alignment): /* Src and dst with same alignment: align both to doubleword. */ mr rALCNT, rRTN @@ -180,93 +187,249 @@ L(g1): #endif blr -L(check_word_alignment): - clrldi. rTMP, rTMP, 62 - beq L(aligned_word_copy) - rldicl rRTNAL, rRTN, 0, 61 - rldicl rSRCAL, rSRC, 0, 61 - cmpld cr7, rSRCAL, rRTNAL - beq cr7, L(same_alignment) - b L(unaligned) - -/* For word aligned memory, operate using word load and stores. */ .align 4 -L(aligned_word_copy): - li rMASK, 0 - addi rRTN, rRTN, -4 - lwz rWORD, 0(rSRC) - b L(g5) +L(unaligned): + cmpdi rSRCAL, 0 /* Check src alignment */ + beq L(srcaligndstunalign) + /* src is unaligned */ + rlwinm r10, rSRC, 3,26,28 /* Calculate padding. */ + clrrdi rSRC, rSRC, 3 /* Align the addr to dw boundary */ + ld rWORD, 0(rSRC) /* Load doubleword from memory. */ + li rTMP, 0 + /* Discard bits not part of the string */ +#ifdef __LITTLE_ENDIAN__ + srd rALT, rWORD, r10 +#else + sld rALT, rWORD, r10 +#endif + cmpb rTMP, rALT, rTMP /* Compare each byte against null */ + /* Discard bits not part of the string */ +#ifdef __LITTLE_ENDIAN__ + sld rTMP, rTMP, r10 +#else + srd rTMP, rTMP, r10 +#endif + cmpdi rTMP, 0 + bne L(bytebybyte) /* if it has null, copy byte by byte */ + subfic r8, r9, 8 + rlwinm r5, rRTN, 3,26,28 /* Calculate padding in bits. */ + rldicl r9, rRTN, 0, 61 /* Calculate padding in bytes. */ + addi rRTN, rRTN, -1 - .align 4 -L(g3): lwzu rALT, 4(rSRC) - stwu rWORD, 4(rRTN) - cmpb rTMP, rALT, rMASK - cmpwi rTMP, 0 - bne L(g4) - lwzu rWORD, 4(rSRC) - stwu rALT, 4(rRTN) -L(g5): cmpb rTMP, rWORD, rMASK - cmpwi rTMP, 0 /* If rTMP is 0, no null in word. */ - beq L(g3) - - mr rALT, rWORD -/* We've hit the end of the string. Do the rest byte-by-byte. */ -L(g4): + cmpdi r5, 0 /* check dest alignment */ + beq L(srcunaligndstalign) + + /* both src and dst unaligned */ #ifdef __LITTLE_ENDIAN__ - rlwinm. rTMP, rALT, 0, 24, 31 - stbu rALT, 4(rRTN) - beqlr- - rlwinm. rTMP, rALT, 24, 24, 31 - stbu rTMP, 1(rRTN) - beqlr- - rlwinm. rTMP, rALT, 16, 24, 31 - stbu rTMP, 1(rRTN) - beqlr- - rlwinm rTMP, rALT, 8, 24, 31 - stbu rTMP, 1(rRTN) + sld rWORD, rALT, r10 + mr r11, r10 + addi r11, r11, -8 /* Adjust byte pointer on loaded dw */ #else - rlwinm. rTMP, rALT, 8, 24, 31 - stbu rTMP, 4(rRTN) - beqlr - rlwinm. rTMP, rALT, 16, 24, 31 - stbu rTMP, 1(rRTN) - beqlr - rlwinm. rTMP, rALT, 24, 24, 31 - stbu rTMP, 1(rRTN) - beqlr - stbu rALT, 1(rRTN) + srd rWORD, rALT, r10 + subfic r11, r10, 64 #endif - blr + /* dst alignment is greater then src alignment? */ + cmpd cr7, r5, r10 + blt cr7, L(dst_align_small) + /* src alignment is less than dst */ -/* Oh well. In this case, we just do a byte-by-byte copy. 
*/ - .align 4 -L(unaligned): - lbz rWORD, 0(rSRC) - addi rRTN, rRTN, -1 - cmpdi rWORD, 0 - beq L(u2) - - .align 5 -L(u0): lbzu rALT, 1(rSRC) - stbu rWORD, 1(rRTN) - cmpdi rALT, 0 - beq L(u1) - lbzu rWORD, 1(rSRC) + /* Calculate the dst alignment differnce */ + subfic rALT, r9, 8 + mtctr rALT + + /* Write till dst is aligned */ + cmpdi rTMP, rALT, 4 + blt L(storebyte1) /* less than 4, store byte by byte */ + beq L(equal1) /* if its 4, store word */ + addi rTMP, rALT, -4 /* greater than 4, so stb and stw */ + mtctr rTMP +L(storebyte1): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on loaded dw */ +#else + addi r11, r11, -8 +#endif + srd rALT, rWORD, r11 + stbu rALT, 1(rRTN) + bdnz L(storebyte1) + + subfic rALT, r9, 8 /* Check the remaining bytes */ + cmpdi rTMP, rALT, 4 + blt L(proceed) + + .align 4 +L(equal1): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on loaded dw */ + srd rALT, rWORD, r11 +#else + subfic r11, r11, 64 + sld rALT, rWORD, r11 + srdi rALT, rALT, 32 +#endif + stw rALT, 1(rRTN) + addi rRTN, rRTN, 4 + +L(proceed): + mr rALT, rWORD + /* calculate the Left over bytes to be written */ + subfic r11, r10, 64 + subfic r5, r5, 64 + subf r5, r5, r11 /* remaining bytes on second dw */ + subfic r10, r5, 64 /* remaining bytes on first dw */ + subfic r9, r9, 8 + subf r8, r9, r8 /* recalculate padding */ +L(srcunaligndstalign): + addi rRTN, rRTN, 1 + subfic r5, r10, 64 /* remaining bytes on second dw */ + addi rSRC, rSRC, 8 + li rTMP,0 + b L(storedouble) + + .align 4 +L(dst_align_small): + mtctr r8 + /* Write till src is aligned */ +L(storebyte2): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on dw */ +#else + addi r11, r11, -8 +#endif + srd rALT, rWORD, r11 stbu rALT, 1(rRTN) - cmpdi rWORD, 0 - beq L(u2) - lbzu rALT, 1(rSRC) - stbu rWORD, 1(rRTN) - cmpdi rALT, 0 - beq L(u1) - lbzu rWORD, 1(rSRC) + bdnz L(storebyte2) + + addi rSRC, rSRC, 8 /* Increment src pointer */ + addi rRTN, rRTN, 1 /* Increment dst pointer */ + rldicl r8, rRTN, 0, 61 /* Recalculate padding */ + + /* src is aligned */ +L(srcaligndstunalign): + ld rWORD, 0(rSRC) + mr rALT, rWORD + li rTMP, 0 /* Check null */ + cmpb rTMP, rWORD, rTMP + cmpdi rTMP, 0 + bne L(bytebybyte) /* Do byte by byte if there is NULL */ + rlwinm r5, rRTN, 3,26,28 /* Calculate padding */ + addi rRTN, rRTN, -1 + subfic r10, r8, 8 + /* write byte by byte till aligned */ +#ifdef __LITTLE_ENDIAN__ + li r11, -8 +#else + li r11, 64 +#endif + mtctr r10 + cmpdi rTMP, r10, 4 + blt L(storebyte) + beq L(equal) + addi rTMP, r10, -4 + mtctr rTMP +L(storebyte): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 /* Adjust byte pointer on dw */ +#else + addi r11, r11, -8 +#endif + srd rALT, rWORD, r11 stbu rALT, 1(rRTN) - cmpdi rWORD, 0 - bne L(u0) -L(u2): stbu rWORD, 1(rRTN) - blr -L(u1): stbu rALT, 1(rRTN) - blr + bdnz L(storebyte) + + cmpdi rTMP, r10, 4 + blt L(align) + + .align 4 +L(equal): +#ifdef __LITTLE_ENDIAN__ + addi r11, r11, 8 + srd rALT, rWORD, r11 +#else + subfic r11, r11, 64 + sld rALT, rWORD, r11 + srdi rALT, rALT, 32 +#endif + stw rALT, 1(rRTN) + addi rRTN, rRTN, 4 +L(align): + addi rRTN, rRTN, 1 + addi rSRC, rSRC, 8 /* Increment src pointer */ + subfic r10, r5, 64 + li rTMP, 0 + /* dst addr aligned to 8 */ +L(storedouble): + ld rALT, 0(rSRC) /* load next dw */ + cmpb rTMP, rALT, rTMP + cmpdi rTMP, 0 /* check for null on each new dw */ + bne L(null) +#ifdef __LITTLE_ENDIAN__ + srd r9, rWORD, r10 /* bytes from first dw */ + sld r11, rALT, r5 /* bytes from second dw */ +#else + sld r9, 
rWORD, r10 + srd r11, rALT, r5 +#endif + or r11, r9, r11 /* make as a single dw */ + std r11, 0(rRTN) /* store as std on aligned addr */ + mr rWORD, rALT /* still few bytes left to be written */ + addi rRTN, rRTN, 8 /* increment dst addr */ + addi rSRC, rSRC, 8 /* increment src addr */ + b L(storedouble) /* Loop till NULL */ + + .align 4 + +/* We've hit the end of the string. Do the rest byte-by-byte. */ +L(null): + addi rRTN, rRTN, -1 + mr r10, r5 + mtctr r8 +#ifdef __LITTLE_ENDIAN__ + subfic r10, r10, 64 + addi r10, r10, -8 +#endif + cmpdi rTMP, r8, 4 + blt L(loop) + + /* we can still use stw if leftover >= 4*/ +#ifdef __LITTLE_ENDIAN__ + addi r10, r10, 8 + srd r11, rWORD, r10 +#else + subfic r10, r10, 64 + sld r11, rWORD, r10 + srdi r11, r11, 32 +#endif + stw r11, 1(rRTN) + addi rRTN, rRTN, 4 + + beq L(bytebybyte1) + addi r10, r10, 32 +#ifdef __LITTLE_ENDIAN__ + addi r10, r10, -8 +#else + subfic r10, r10, 64 +#endif + addi rTMP, r8, -4 + mtctr rTMP + /* remaining byte by byte part of first dw */ +L(loop): +#ifdef __LITTLE_ENDIAN__ + addi r10, r10, 8 +#else + addi r10, r10, -8 +#endif + srd rTMP, rWORD, r10 + stbu rTMP, 1(rRTN) + bdnz L(loop) + +L(bytebybyte1): + addi rRTN, rRTN, 1 + /* remaining byte by byte part of second dw */ +L(bytebybyte): + addi rRTN, rRTN, -8 + b L(g1) + END (FUNC_NAME) #ifndef USE_AS_STPCPY diff --git a/sysdeps/powerpc/powerpc64/power7/strncat.S b/sysdeps/powerpc/powerpc64/power7/strncat.S deleted file mode 100644 index f5ea52d3d4..0000000000 --- a/sysdeps/powerpc/powerpc64/power7/strncat.S +++ /dev/null @@ -1,228 +0,0 @@ -/* Optimized strncat implementation for PowerPC64/POWER7. - - Copyright (C) 2014 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -/* The algorithm is as follows for aligned memory access : - - if address of s2 is divisible by 0x7UL, - perform aligned doubleword catenation - else - perform unaligned catenation - - The aligned comparison are made using cmpb instructions. */ - -/* char* [r3] strncat (const char *s1 [r3], - const char *s2 [r4], - size_t size [r5]) */ - -#include <sysdep.h> - -#ifndef STRNCAT -# undef strncat -# define STRNCAT strncat -#endif - -#ifndef STRLEN -/* For builds with no IFUNC support, local calls should be made to internal - GLIBC symbol (created by libc_hidden_builtin_def). */ -# ifdef SHARED -# define STRLEN __GI_strlen -# else -# define STRLEN strlen -# endif -#endif - -#define FRAMESIZE (FRAME_MIN_SIZE+32) - - .machine power7 -EALIGN(STRNCAT, 4, 0) - CALL_MCOUNT 3 - - mflr r0 /* Load link register LR to r0. */ - -/* We shall use r29, r30 and r31 non volatile register for retention. - Save all the callee registers in the GPR save area. */ - std r29, -24(r1) /* Save callers register r29. */ - std r30, -16(r1) /* Save callers register r30. */ - std r31, -8(r1) /* Save callers register r31. 
*/ - - std r0, 16(r1) /* Store the link register. */ - stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */ - -/* Improve performance with CPU pre-fetch. */ - dcbt 0, r3 /* Pre-fetch str to avoid cache - miss. */ - dcbt 0, r4 /* Pre-fetch accept to avoid cache - miss. */ - - mr. r29, r5 /* Save "n" in r29. */ - mr r30, r3 /* Save "s1" in r30 from r3. */ - beq cr0,L(done) - - mr r31, r4 /* Save "s2" in r31 from r4. */ - bl STRLEN /* Call optimized strlen on s1; goto - end of s1. */ - nop - cmpldi cr7, r29, 7 /* If s2 is <=7 process - byte-by-byte. */ - add r3, r30, r3 /* Grab the last character of s1. */ - bgt cr7,L(alignment) /* Process by aligned strings. */ - - cmpldi cr7, r29, 3 /* If n is >= 4, we can - byte-unroll. */ - addi r9, r3, -1 /* Make "s1" point before next - character, increment when read. */ - bgt cr7, L(bytes_unroll) /* Process each byte. */ - -L(byte_by_byte): - lbz r10, 0(r31) - addi r8, r9, 1 - cmpdi cr7, r10, 0 /* Check for NULL in "s2". */ - stb r10, 1(r9) - beq cr7, L(done) - add r9, r9, r29 - subf r9, r8, r9 - addi r9, r9, 1 - mtctr r9 - b L(branch2) - .p2align 4 -L(branch1): - lbzu r10, 1(r31) - cmpdi cr7, r10, 0 - stbu r10, 1(r8) - beq cr7,L(done) -L(branch2): - mr r9, r8 - bdnz L(branch1) - beq cr7,L(done) -L(nullTerminate): - li r10, 0 /* Load NULL for termination. */ - stb r10, 1(r9) /* Append or terminate s1 with - NULL. */ - .p2align 4 /* A small section here. */ -L(done): /* We return now. */ - addi r1, r1, FRAMESIZE /* Restore stack pointer. */ - mr r3, r30 /* Set the return value length of - string. */ - ld r0, 16(r1) /* Read the saved link register. */ - ld r29, -24(r1) /* Restore save register r29. */ - ld r30, -16(r1) /* Restore save register r30. */ - ld r31, -8(r1) /* Restore save register r31. */ - mtlr r0 /* Restore link register. */ - blr /* Branch to link register. */ - - .p2align 4 -L(alignment): - rldicl. r9, r31, 0, 61 /* Check if s2 is 8byte aligned */ - beq cr0,L(dwordAligned) - - .p2align 4 -/* Unaligned bytes in string, so process byte by byte. - POWER7 has performance gains over loop unroll. */ -L(bytes_unroll): - addi r9, r3, -1 - srdi r10, r29, 2 - mtctr r10 - b L(L10) - .p2align 4 -L(L44): - lbz r10, 1(r31) /* Load byte. */ - cmpdi cr7, r10, 0 /* Compare ; if byte not zero, - continue. */ - stb r10, 2(r9) /* Store byte */ - beq cr7, L(done) - addi r31, r31, 4 - - lbz r10, -2(r31) /* Perform loop unroll here on byte - load and store. */ - cmpdi cr7, r10, 0 - stb r10, 3(r9) - beq cr7, L(done) - - lbz r10, -1(r31) /* Loop unroll here. */ - cmpdi cr7, r10, 0 - stbu r10, 4(r9) - beq cr7, L(done) - - bdz L(leftNbytes) - -L(L10): - lbz r10, 0(r31) /* Loop unroll here. */ - cmpdi cr7, r10, 0 - stb r10, 1(r9) - bne cr7,L(L44) - b L(done) - .p2align 4 -/* If s2 is double word aligned, we load and store double word. */ -L(dwordAligned): -/* read, write 8 bytes at a time */ - srdi r8, r29, 3 /* Compute count for CTR to loop; - count = n/8. */ - li r7, 0 /* Load r7 with NULL. */ - li r10, 0 /* Load r10 with MASK '0'. */ - - mtctr r8 /* Move count to CTR. */ -L(loop8): - ld r9, 0(r31) /* Read double word from s2. */ - cmpb r6, r9, r10 /* Compare bytes in s2 we read - just now. */ - cmpdi r6, 0 /* If cmpb returned NULL, - we continue. */ - bne+ L(a8) - std r9, 0(r3) /* Append double word from s2 - with s1. */ - addi r3, r3, 8 /* Increment s1. */ - addi r31, r31, 8 /* Increment s2. */ - subi r29, r29, 8 /* Decrement count by 8. */ - bdnz L(loop8) /* Continue until "count" is - non zero. */ - -L(a8): - cmpdi r29, 0 /* If "n" is already zero, we skip. 
*/ - beq+ L(align8align) - - mtctr r29 /* Process left over bytes in "n". */ -L(unaligned0): - lbz r9, 0(r31) /* Read a byte from s2. */ - cmpw r9, r7 /* If byte is NULL, we stop here . */ - beq+ L(align8align) /* Skip processing further if NULL. */ - stb r9, 0(r3) /* If not NULL, store byte into s1. */ - addi r3, r3, 1 /* Increment s1 by 1. */ - addi r31, r31, 1 /* Increment s2 by 1. */ - bdnz L(unaligned0) /* Decrement counter "n" and loop - until non zero. */ -L(align8align): - stb r7, 0(r3) /* Terminate s1 with NULL. */ - - addi r1, r1, FRAMESIZE /* Restore stack pointer. */ - mr r3, r30 /* Set the return value, length of - string. */ - ld r0, 16(r1) /* Read the saved link register. */ - ld r29, -24(r1) /* Restore save register r29. */ - ld r30, -16(r1) /* Restore save register r30. */ - ld r31, -8(r1) /* Restore save register r31. */ - mtlr r0 /* Restore link register. */ - blr /* Branch to link register */ - - .p2align 4 -L(leftNbytes): - rldicl. r29, r29, 0, 62 /* Check if n>0 and n < 4 bytes. */ - bne cr0,L(byte_by_byte) /* Process bytes one by one. */ - b L(nullTerminate) /* Now, finish catenation with - NULL termination. */ -END(STRNCAT) diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S index 2b27e7b923..3e981265ab 100644 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S @@ -17,14 +17,9 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> -#include <endian.h> #include <math_ldbl_opt.h> -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define MFVSRD_R3_V1 .byte 0x66,0x00,0x23,0x7c /* mfvsrd r3,vs1 */ -#else -#define MFVSRD_R3_V1 .byte 0x7c,0x23,0x00,0x66 /* mfvsrd r3,vs1 */ -#endif +#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ /* int [r3] __finite ([fp1] x) */ diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S index d09b7fcef9..125de3943d 100644 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S @@ -17,14 +17,9 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> -#include <endian.h> #include <math_ldbl_opt.h> -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define MFVSRD_R3_V1 .byte 0x66,0x00,0x23,0x7c /* mfvsrd r3,vs1 */ -#else -#define MFVSRD_R3_V1 .byte 0x7c,0x23,0x00,0x66 /* mfvsrd r3,vs1 */ -#endif +#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ /* int [r3] __isinf([fp1] x) */ diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S index cf119e5c98..2c7b2d1d9a 100644 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S @@ -17,14 +17,9 @@ <http://www.gnu.org/licenses/>. */ #include <sysdep.h> -#include <endian.h> #include <math_ldbl_opt.h> -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define MFVSRD_R3_V1 .byte 0x66,0x00,0x23,0x7c /* mfvsrd r3,vs1 */ -#else -#define MFVSRD_R3_V1 .byte 0x7c,0x23,0x00,0x66 /* mfvsrd r3,vs1 */ -#endif +#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ /* int [r3] __isnan([f1] x) */ diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S index 9a55d93875..ce48d4e52c 100644 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S @@ -17,14 +17,9 @@ <http://www.gnu.org/licenses/>. 
*/ #include <sysdep.h> -#include <endian.h> #include <math_ldbl_opt.h> -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define MFVSRD_R3_V1 .byte 0x66,0x00,0x23,0x7c /* mfvsrd r3,vs1 */ -#else -#define MFVSRD_R3_V1 .byte 0x7c,0x23,0x00,0x66 /* mfvsrd r3,vs1 */ -#endif +#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ /* long long int[r3] __llrint (double x[fp1]) */ ENTRY (__llrint) diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S index f10c06a36c..17cf30eaf1 100644 --- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S +++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S @@ -20,11 +20,7 @@ #include <endian.h> #include <math_ldbl_opt.h> -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define MFVSRD_R3_V1 .byte 0x66,0x00,0x23,0x7c /* mfvsrd r3,vs1 */ -#else -#define MFVSRD_R3_V1 .byte 0x7c,0x23,0x00,0x66 /* mfvsrd r3,vs1 */ -#endif +#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */ /* long long [r3] llround (float x [fp1]) */ diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S new file mode 100644 index 0000000000..d7324dc54a --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/memset.S @@ -0,0 +1,451 @@ +/* Optimized memset implementation for PowerPC64/POWER8. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */ + +/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5])); + Returns 's'. */ + + .machine power8 +EALIGN (memset, 5, 0) + CALL_MCOUNT 3 + +L(_memset): + cmpldi cr7,r5,31 + neg r0,r3 + mr r10,r3 + + insrdi r4,r4,8,48 + insrdi r4,r4,16,32 /* Replicate byte to word. */ + ble cr7,L(write_LT_32) + + andi. r11,r10,15 /* Check alignment of DST. */ + insrdi r4,r4,32,0 /* Replicate word to double word. */ + + beq L(big_aligned) + + mtocrf 0x01,r0 + clrldi r0,r0,60 + + /* Get DST aligned to 16 bytes. */ +1: bf 31,2f + stb r4,0(r10) + addi r10,r10,1 + +2: bf 30,4f + sth r4,0(r10) + addi r10,r10,2 + +4: bf 29,8f + stw r4,0(r10) + addi r10,r10,4 + +8: bf 28,16f + std r4,0(r10) + addi r10,r10,8 + +16: subf r5,r0,r5 + + .align 4 +L(big_aligned): + /* For sizes larger than 255 two possible paths: + - if constant is '0', zero full cache lines with dcbz + - otherwise uses vector instructions. */ + cmpldi cr5,r5,255 + dcbtst 0,r10 + cmpldi cr6,r4,0 + crand 27,26,21 + bt 27,L(huge_dcbz) + bge cr5,L(huge_vector) + + + /* Size between 32 and 255 bytes with constant different than 0, use + doubleword store instruction to achieve best throughput. */ + srdi r8,r5,5 + clrldi r11,r5,59 + cmpldi cr6,r11,0 + cmpdi r8,0 + beq L(tail_bytes) + mtctr r8 + + /* Main aligned write loop, writes 32-bytes at a time. 
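+ As a rough C model (illustrative names, not from this file), each
+ iteration stores four doublewords of the replicated byte pattern:
+
+ uint64_t *p = (uint64_t *) dst;
+ while (nblocks-- > 0)
+ {
+ p[0] = p[1] = p[2] = p[3] = pattern;
+ p += 4;
+ }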
*/ + .align 4 +L(big_loop): + std r4,0(r10) + std r4,8(r10) + std r4,16(r10) + std r4,24(r10) + addi r10,r10,32 + bdz L(tail_bytes) + + std r4,0(r10) + std r4,8(r10) + std r4,16(r10) + std r4,24(r10) + addi r10,10,32 + bdnz L(big_loop) + + b L(tail_bytes) + + /* Write remaining 1~31 bytes. */ + .align 4 +L(tail_bytes): + beqlr cr6 + + srdi r7,r11,4 + clrldi r8,r11,60 + mtocrf 0x01,r7 + + .align 4 + bf 31,8f + std r4,0(r10) + std r4,8(r10) + addi r10,r10,16 + + .align 4 +8: mtocrf 0x1,r8 + bf 28,4f + std r4,0(r10) + addi r10,r10,8 + + .align 4 +4: bf 29,2f + stw 4,0(10) + addi 10,10,4 + + .align 4 +2: bf 30,1f + sth 4,0(10) + addi 10,10,2 + + .align 4 +1: bflr 31 + stb 4,0(10) + blr + + /* Size larger than 255 bytes with constant different than 0, use + vector instruction to achieve best throughput. */ +L(huge_vector): + /* Replicate set byte to quadword in VMX register. */ + MTVSRD_V1_R4 + xxpermdi 32,v0,v1,0 + vspltb v2,v0,15 + + /* Main aligned write loop: 128 bytes at a time. */ + li r6,16 + li r7,32 + li r8,48 + mtocrf 0x02,r5 + srdi r12,r5,7 + cmpdi r12,0 + beq L(aligned_tail) + mtctr r12 + b L(aligned_128loop) + + .align 4 +L(aligned_128loop): + stvx v2,0,r10 + stvx v2,r10,r6 + stvx v2,r10,r7 + stvx v2,r10,r8 + addi r10,r10,64 + stvx v2,0,r10 + stvx v2,r10,r6 + stvx v2,r10,r7 + stvx v2,r10,r8 + addi r10,r10,64 + bdnz L(aligned_128loop) + + /* Write remaining 1~127 bytes. */ +L(aligned_tail): + mtocrf 0x01,r5 + bf 25,32f + stvx v2,0,r10 + stvx v2,r10,r6 + stvx v2,r10,r7 + stvx v2,r10,r8 + addi r10,r10,64 + +32: bf 26,16f + stvx v2,0,r10 + stvx v2,r10,r6 + addi r10,r10,32 + +16: bf 27,8f + stvx v2,0,r10 + addi r10,r10,16 + +8: bf 28,4f + std r4,0(r10) + addi r10,r10,8 + + /* Copies 4~7 bytes. */ +4: bf 29,L(tail2) + stw r4,0(r10) + bf 30,L(tail5) + sth r4,4(r10) + bflr 31 + stb r4,6(r10) + /* Return original DST pointer. */ + blr + + /* Special case when value is 0 and we have a long length to deal + with. Use dcbz to zero out a full cacheline of 128 bytes at a time. + Before using dcbz though, we need to get the destination 128-byte + aligned. */ + .align 4 +L(huge_dcbz): + andi. r11,r10,127 + neg r0,r10 + beq L(huge_dcbz_aligned) + + clrldi r0,r0,57 + subf r5,r0,r5 + srdi r0,r0,3 + mtocrf 0x01,r0 + + /* Write 1~128 bytes until DST is aligned to 128 bytes. */ +8: bf 28,4f + + std r4,0(r10) + std r4,8(r10) + std r4,16(r10) + std r4,24(r10) + std r4,32(r10) + std r4,40(r10) + std r4,48(r10) + std r4,56(r10) + addi r10,r10,64 + + .align 4 +4: bf 29,2f + std r4,0(r10) + std r4,8(r10) + std r4,16(r10) + std r4,24(r10) + addi r10,r10,32 + + .align 4 +2: bf 30,1f + std r4,0(r10) + std r4,8(r10) + addi r10,r10,16 + + .align 4 +1: bf 31,L(huge_dcbz_aligned) + std r4,0(r10) + addi r10,r10,8 + +L(huge_dcbz_aligned): + /* Setup dcbz unroll offsets and count numbers. */ + srdi r8,r5,9 + clrldi r11,r5,55 + cmpldi cr6,r11,0 + li r9,128 + cmpdi r8,0 + beq L(huge_tail) + li r7,256 + li r6,384 + mtctr r8 + + .align 4 +L(huge_loop): + /* Sets 512 bytes to zero in each iteration, the loop unrolling shows + a throughput boost for large sizes (2048 bytes or higher). */ + dcbz 0,r10 + dcbz r9,r10 + dcbz r7,r10 + dcbz r6,r10 + addi r10,r10,512 + bdnz L(huge_loop) + + beqlr cr6 + +L(huge_tail): + srdi r6,r11,8 + srdi r7,r11,4 + clrldi r8,r11,4 + cmpldi cr6,r8,0 + mtocrf 0x01,r6 + + beq cr6,L(tail) + + /* We have 1~511 bytes remaining. 
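+ The mtocrf/bf pairs below walk the binary digits of that remainder,
+ dispatching progressively smaller stores; a hedged sketch (helper
+ names are hypothetical):
+
+ if (rem & 256) { dcbz_two_lines (p); p += 256; }
+ if (rem & 128) { dcbz_line (p); p += 128; }
+ if (rem & 64) { eight doubleword stores }
+ and so on down to word, halfword and byte stores.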
*/ + .align 4 +32: bf 31,16f + dcbz 0,r10 + dcbz r9,r10 + addi r10,r10,256 + + .align 4 +16: mtocrf 0x01,r7 + bf 28,8f + dcbz 0,r10 + addi r10,r10,128 + + .align 4 +8: bf 29,4f + std r4,0(r10) + std r4,8(r10) + std r4,16(r10) + std r4,24(r10) + std r4,32(r10) + std r4,40(r10) + std r4,48(r10) + std r4,56(r10) + addi r10,r10,64 + + .align 4 +4: bf 30,2f + std r4,0(r10) + std r4,8(r10) + std r4,16(r10) + std r4,24(r10) + addi r10,r10,32 + + .align 4 +2: bf 31,L(tail) + std r4,0(r10) + std r4,8(r10) + addi r10,r10,16 + .align 4 + + /* Remaining 1~15 bytes. */ +L(tail): + mtocrf 0x01,r8 + + .align +8: bf 28,4f + std r4,0(r10) + addi r10,r10,8 + + .align 4 +4: bf 29,2f + stw r4,0(r10) + addi r10,r10,4 + + .align 4 +2: bf 30,1f + sth r4,0(r10) + addi r10,r10,2 + + .align 4 +1: bflr 31 + stb r4,0(r10) + blr + + /* Handle short copies of 0~31 bytes. Best throughput is achieved + by just unrolling all operations. */ + .align 4 +L(write_LT_32): + cmpldi cr6,5,8 + mtocrf 0x01,r5 + ble cr6,L(write_LE_8) + + /* At least 9 bytes to go. */ + neg r8,r4 + andi. r0,r8,3 + cmpldi cr1,r5,16 + beq L(write_LT_32_aligned) + + /* Force 4-byte alignment for SRC. */ + mtocrf 0x01,r0 + subf r5,r0,r5 + +2: bf 30,1f + sth r4,0(r10) + addi r10,r10,2 + +1: bf 31,L(end_4bytes_alignment) + stb r4,0(r10) + addi r10,r10,1 + + .align 4 +L(end_4bytes_alignment): + cmpldi cr1,r5,16 + mtocrf 0x01,r5 + +L(write_LT_32_aligned): + blt cr1,8f + + stw r4,0(r10) + stw r4,4(r10) + stw r4,8(r10) + stw r4,12(r10) + addi r10,r10,16 + +8: bf 28,L(tail4) + stw r4,0(r10) + stw r4,4(r10) + addi r10,r10,8 + + .align 4 + /* Copies 4~7 bytes. */ +L(tail4): + bf 29,L(tail2) + stw r4,0(r10) + bf 30,L(tail5) + sth r4,4(r10) + bflr 31 + stb r4,6(r10) + blr + + .align 4 + /* Copies 2~3 bytes. */ +L(tail2): + bf 30,1f + sth r4,0(r10) + bflr 31 + stb r4,2(r10) + blr + + .align 4 +L(tail5): + bflr 31 + stb r4,4(r10) + blr + + .align 4 +1: bflr 31 + stb r4,0(r10) + blr + + /* Handles copies of 0~8 bytes. */ + .align 4 +L(write_LE_8): + bne cr6,L(tail4) + + stw r4,0(r10) + stw r4,4(r10) + blr +END_GEN_TB (memset,TB_TOCLESS) +libc_hidden_builtin_def (memset) + +/* Copied from bzero.S to prevent the linker from inserting a stub + between bzero and memset. */ +ENTRY (__bzero) + CALL_MCOUNT 3 + mr r5,r4 + li r4,0 + b L(_memset) +END (__bzero) +#ifndef __bzero +weak_alias (__bzero, bzero) +#endif diff --git a/sysdeps/powerpc/powerpc64/power8/stpcpy.S b/sysdeps/powerpc/powerpc64/power8/stpcpy.S new file mode 100644 index 0000000000..bf72065114 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/stpcpy.S @@ -0,0 +1,24 @@ +/* Optimized stpcpy implementation for PowerPC64/POWER8. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/
+
+#define USE_AS_STPCPY
+#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/powerpc/powerpc64/power8/stpncpy.S b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
new file mode 100644
index 0000000000..76a146609f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
@@ -0,0 +1,20 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S
new file mode 100644
index 0000000000..223d891c2e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcmp.S
@@ -0,0 +1,257 @@
+/* Optimized strcmp implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Implements the function
+
+ int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
+
+ The implementation uses unaligned doubleword access to avoid specialized
+ code paths depending on data alignment. Although recent powerpc64 uses
+ 64K as default, the page cross handling assumes minimum page size of
+ 4k. */
+
+EALIGN (strcmp, 4, 0)
+ li r0,0
+
+ /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+ the code:
+
+ (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+ with PAGE_SIZE being 4096 and ITER_SIZE being 32. */
+
+ rldicl r7,r3,0,52
+ rldicl r9,r4,0,52
+ cmpldi cr7,r7,4096-32
+ bgt cr7,L(pagecross_check)
+ cmpldi cr5,r9,4096-32
+ bgt cr5,L(pagecross_check)
+
+ /* For short strings of up to 32 bytes, load both s1 and s2 using
+ unaligned dwords and compare. */
+ ld r8,0(r3)
+ ld r10,0(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ld r8,8(r3)
+ ld r10,8(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ld r8,16(r3)
+ ld r10,16(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ld r8,24(r3)
+ ld r10,24(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ addi r7,r3,32
+ addi r4,r4,32
+
+L(align_8b):
+ /* The first 32 bytes have now been checked; align source1 to
+ doubleword and adjust the source2 address. */
+ rldicl r9,r7,0,61 /* source1 alignment to doubleword */
+ subf r4,r9,r4 /* Adjust source2 address based on source1
+ alignment. */
+ rldicr r7,r7,0,60 /* Align source1 to doubleword. */
+
+ /* At this point, source1 alignment is 0 and source2 alignment is
+ between 0 and 7. Check if source2 alignment is 0, meaning both
+ sources have the same alignment. */
+ andi. r9,r4,0x7
+ bne cr0,L(loop_diff_align)
+
+ /* If both source1 and source2 are doubleword aligned, there is no
+ need for page boundary cross checks. */
+
+ ld r8,0(r7)
+ ld r10,0(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ .align 4
+L(loop_equal_align):
+ ld r8,8(r7)
+ ld r10,8(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ld r8,16(r7)
+ ld r10,16(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ldu r8,24(r7)
+ ldu r10,24(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ b L(loop_equal_align)
+
+ /* A zero byte was found in r8 (s1 dword), r9 contains the cmpb
+ result and r10 the dword from s2. The code isolates the bytes
+ up to the end of the string (including the '\0'), masking the
+ remaining ones with 0xFF:
+
+ #ifdef __LITTLE_ENDIAN__
+ (__builtin_ffsl (x) - 1) = counting trailing zero bits
+ r9 = (__builtin_ffsl (r9) - 1) + 8;
+ r9 = -1UL << r9
+ #else
+ r9 = __builtin_clzl (r9) + 8;
+ r9 = -1UL >> r9
+ #endif
+ r8 = r8 | r9
+ r10 = r10 | r9 */
+
+#ifdef __LITTLE_ENDIAN__
+ nor r9,r9,r9
+L(different_nocmpb):
+ neg r3,r9
+ and r9,r9,r3
+ cntlzd r9,r9
+ subfic r9,r9,63
+#else
+ not r9,r9
+L(different_nocmpb):
+ cntlzd r9,r9
+ subfic r9,r9,56
+#endif
+ srd r3,r8,r9
+ srd r10,r10,r9
+ rldicl r10,r10,0,56
+ rldicl r3,r3,0,56
+ subf r3,r10,r3
+ extsw r3,r3
+ blr
+
+ .align 4
+L(pagecross_check):
+ subfic r9,r9,4096
+ subfic r7,r7,4096
+ cmpld cr7,r7,r9
+ bge cr7,L(pagecross)
+ mr r7,r9
+
+ /* If the unaligned 16-byte read crosses a 4K page boundary, use a
+ simple byte-by-byte comparison until the page alignment for s1
+ is reached. */
+L(pagecross):
+ add r7,r3,r7
+ subf r9,r3,r7
+ mtctr r9
+
+ .align 4
+L(pagecross_loop):
+ /* Load a byte from s1 and s2, check whether *s1 equals *s2 and
+ whether *s1 is '\0'. */
+ lbz r9,0(r3)
+ lbz r10,0(r4)
+ addi r3,r3,1
+ addi r4,r4,1
+ cmplw cr7,r9,r10
+ cmpdi cr5,r9,0
+ bne cr7,L(pagecross_ne)
+ beq cr5,L(pagecross_nullfound)
+ bdnz L(pagecross_loop)
+ b L(align_8b)
+
+ .align 4
+ /* The unaligned read of source2 will cross a 4K page boundary,
+ and the differing byte or null terminator may be in the
+ remaining page bytes. Since the unaligned load cannot be used,
+ the algorithm reads and compares 8 bytes to keep source1
+ doubleword aligned. */
+L(check_source2_byte):
+ li r9,8
+ mtctr r9
+
+ .align 4
+L(check_source2_byte_loop):
+ lbz r9,0(r7)
+ lbz r10,0(r4)
+ addi r7,r7,1
+ addi r4,r4,1
+ cmplw cr7,r9,r10
+ cmpdi cr5,r9,0
+ bne cr7,L(pagecross_ne)
+ beq cr5,L(pagecross_nullfound)
+ bdnz L(check_source2_byte_loop)
+
+ /* If source2 is unaligned to doubleword, the code needs to check
+ on each iteration whether the unaligned doubleword access will
+ cross a 4k page boundary.
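+ In pseudo-C, the guard evaluated before every unaligned load is
+ roughly (a sketch; the constants are illustrative of the logic,
+ not taken verbatim from this file):
+
+ if (((uintptr_t) s2 % 4096) <= 4096 - 8)
+ compare the next doubleword pair
+ else
+ fall back to eight byte-by-byte compares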
*/ + .align 5 +L(loop_unaligned): + ld r8,0(r7) + ld r10,0(r4) + cmpb r12,r8,r0 + cmpb r11,r8,r10 + orc. r9,r12,r11 + bne cr0,L(different_nocmpb) + addi r7,r7,8 + addi r4,r4,8 + +L(loop_diff_align): + /* Check if [src2]+8 cross a 4k page boundary: + + srcin2 % PAGE_SIZE > (PAGE_SIZE - 8) + + with PAGE_SIZE being 4096. */ + rldicl r9,r4,0,52 + cmpldi cr7,r9,4088 + ble cr7,L(loop_unaligned) + b L(check_source2_byte) + + .align 4 +L(pagecross_ne): + extsw r3,r9 + mr r9,r10 +L(pagecross_retdiff): + subf r9,r9,r3 + extsw r3,r9 + blr + + .align 4 +L(pagecross_nullfound): + li r3,0 + b L(pagecross_retdiff) +END (strcmp) +libc_hidden_builtin_def (strcmp) diff --git a/sysdeps/powerpc/powerpc64/power8/strcpy.S b/sysdeps/powerpc/powerpc64/power8/strcpy.S new file mode 100644 index 0000000000..d3e9a101c5 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power8/strcpy.S @@ -0,0 +1,262 @@ +/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8. + Copyright (C) 2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#ifdef USE_AS_STPCPY +# define FUNC_NAME __stpcpy +#else +# define FUNC_NAME strcpy +#endif + +/* Implements the function + + char * [r3] strcpy (char *dest [r3], const char *src [r4]) + + or + + char * [r3] stpcpy (char *dest [r3], const char *src [r4]) + + if USE_AS_STPCPY is defined. + + The implementation uses unaligned doubleword access to avoid specialized + code paths depending of data alignment. Although recent powerpc64 uses + 64K as default, the page cross handling assumes minimum page size of + 4k. */ + + .machine power7 +EALIGN (FUNC_NAME, 4, 0) + li r0,0 /* Doubleword with null chars to use + with cmpb. */ + + /* Check if the [src]+15 will cross a 4K page by checking if the bit + indicating the page size changes. Basically: + + uint64_t srcin = (uint64_t)src; + uint64_t ob = srcin & 4096UL; + uint64_t nb = (srcin+15UL) & 4096UL; + if (ob ^ nb) + goto pagecross; */ + + addi r9,r4,15 + xor r9,r9,r4 + rlwinm. r9,r9,0,19,19 + bne L(pagecross) + + /* For short string (less than 16 bytes), just calculate its size as + strlen and issues a memcpy if null is found. */ + mr r7,r4 + ld r12,0(r7) /* Load doubleword from memory. */ + cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */ + cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ + bne cr7,L(done) + + ldu r8,8(r7) + cmpb r10,r8,r0 + cmpdi cr7,r10,0 + bne cr7,L(done) + + b L(loop_before) + + .align 4 +L(pagecross): + clrrdi r7,r4,3 /* Align the address to doubleword boundary. */ + rlwinm r6,r4,3,26,28 /* Calculate padding. */ + li r5,-1 /* MASK = 0xffffffffffffffff. */ + ld r12,0(r7) /* Load doubleword from memory. */ +#ifdef __LITTLE_ENDIAN__ + sld r5,r5,r6 +#else + srd r5,r5,r6 /* MASK = MASK >> padding. */ +#endif + orc r9,r12,r5 /* Mask bits that are not part of the string. 
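+ In C terms this step is roughly (sketch only; LOAD stands for the
+ aligned doubleword just read):
+
+ mask = -1UL >> pad_bits; (shifted left on little-endian)
+ w = LOAD | ~mask;
+
+ i.e. the bytes before the start of the string are forced to 0xff,
+ so the cmpb against zero below cannot report a spurious null
+ byte in the padding.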
*/ + cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */ + cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */ + bne cr7,L(done) + + ldu r6,8(r7) + cmpb r10,r6,r0 + cmpdi cr7,r10,0 + bne cr7,L(done) + + ld r12,0(r7) + cmpb r10,r12,r0 + cmpdi cr7,r10,0 + bne cr7,L(done) + + ldu r6,8(r7) + cmpb r10,r6,r0 + cmpdi cr7,r10,0 + bne cr7,L(done) + + /* We checked for 24 - x bytes, with x being the source alignment + (0 <= x <= 16), and no zero has been found. Start the loop + copy with doubleword aligned address. */ + mr r7,r4 + ld r12, 0(r7) + ldu r8, 8(r7) + +L(loop_before): + /* Save the two doublewords readed from source and align the source + to 16 bytes for the loop. */ + mr r11,r3 + std r12,0(r11) + std r8,8(r11) + addi r11,r11,16 + rldicl r9,r4,0,60 + subf r7,r9,r7 + subf r11,r9,r11 + b L(loop_start) + + .align 5 +L(loop): + std r12, 0(r11) + std r6, 8(r11) + addi r11,r11,16 +L(loop_start): + /* Load two doublewords, compare and merge in a + single register for speed. This is an attempt + to speed up the null-checking process for bigger strings. */ + + ld r12, 8(r7) + ldu r6, 16(r7) + cmpb r10,r12,r0 + cmpb r9,r6,r0 + or r8,r9,r10 /* Merge everything in one doubleword. */ + cmpdi cr7,r8,0 + beq cr7,L(loop) + + + /* OK, one (or both) of the doublewords contains a null byte. Check + the first doubleword and decrement the address in case the first + doubleword really contains a null byte. */ + + addi r4,r7,-8 + cmpdi cr6,r10,0 + addi r7,r7,-8 + bne cr6,L(done2) + + /* The null byte must be in the second doubleword. Adjust the address + again and move the result of cmpb to r10 so we can calculate the + length. */ + + mr r10,r9 + addi r7,r7,8 + b L(done2) + + /* r10 has the output of the cmpb instruction, that is, it contains + 0xff in the same position as the null byte in the original + doubleword from the string. Use that to calculate the length. */ +L(done): + mr r11,r3 +L(done2): +#ifdef __LITTLE_ENDIAN__ + addi r9, r10, -1 /* Form a mask from trailing zeros. */ + andc r9, r9, r10 + popcntd r6, r9 /* Count the bits in the mask. */ +#else + cntlzd r6,r10 /* Count leading zeros before the match. */ +#endif + subf r5,r4,r7 + srdi r6,r6,3 /* Convert leading/trailing zeros to bytes. */ + add r8,r5,r6 /* Compute final length. */ +#ifdef USE_AS_STPCPY + /* stpcpy returns the dest address plus the size not counting the + final '\0'. */ + add r3,r11,r8 +#endif + addi r8,r8,1 /* Final '/0'. */ + + cmpldi cr6,r8,8 + mtocrf 0x01,r8 + ble cr6,L(copy_LE_8) + + cmpldi cr1,r8,16 + blt cr1,8f + + /* Handle copies of 0~31 bytes. */ + .align 4 +L(copy_LT_32): + /* At least 6 bytes to go. */ + blt cr1,8f + + /* Copy 16 bytes. */ + ld r6,0(r4) + ld r8,8(r4) + addi r4,r4,16 + std r6,0(r11) + std r8,8(r11) + addi r11,r11,16 +8: /* Copy 8 bytes. */ + bf 28,L(tail4) + ld r6,0(r4) + addi r4,r4,8 + std r6,0(r11) + addi r11,r11,8 + + .align 4 +/* Copies 4~7 bytes. */ +L(tail4): + bf 29,L(tail2) + lwz r6,0(r4) + stw r6,0(r11) + bf 30,L(tail5) + lhz r7,4(r4) + sth r7,4(r11) + bflr 31 + lbz r8,6(r4) + stb r8,6(r11) + blr + + .align 4 +/* Copies 2~3 bytes. */ +L(tail2): + bf 30,1f + lhz r6,0(r4) + sth r6,0(r11) + bflr 31 + lbz r7,2(r4) + stb r7,2(r11) + blr + + .align 4 +L(tail5): + bf 31,1f + lbz r6,4(r4) + stb r6,4(r11) + blr + + .align 4 +1: + bflr 31 + lbz r6,0(r4) + stb r6,0(r11) + blr + +/* Handles copies of 0~8 bytes. 
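+ cr6 was set above by comparing the total number of bytes to copy
+ (including the final null) with 8; as a hedged C model
+ (illustrative only):
+
+ if (len == 8)
+ copy a single doubleword;
+ else
+ use the 4/2/1-byte tail sequence above.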
*/
+ .align 4
+L(copy_LE_8):
+ bne cr6,L(tail4)
+ ld r6,0(r4)
+ std r6,0(r11)
+ blr
+END (FUNC_NAME)
+
+#ifndef USE_AS_STPCPY
+libc_hidden_builtin_def (strcpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strncmp.S b/sysdeps/powerpc/powerpc64/power8/strncmp.S
new file mode 100644
index 0000000000..56c814b88c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncmp.S
@@ -0,0 +1,323 @@
+/* Optimized strncmp implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Implements the function
+
+ int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)
+
+ The implementation uses unaligned doubleword access to avoid specialized
+ code paths depending on data alignment. Although recent powerpc64 uses
+ 64K as default, the page cross handling assumes minimum page size of
+ 4k. */
+
+ .machine power7
+EALIGN (strncmp, 4, 0)
+ /* Check if size is 0. */
+ mr. r10,r5
+ beq cr0,L(ret0)
+
+ /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
+ the code:
+
+ (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+ with PAGE_SIZE being 4096 and ITER_SIZE being 16. */
+ rldicl r8,r3,0,52
+ cmpldi cr7,r8,4096-16
+ bgt cr7,L(pagecross)
+ rldicl r9,r4,0,52
+ cmpldi cr7,r9,4096-16
+ bgt cr7,L(pagecross)
+
+ /* For short strings of up to 16 bytes, load both s1 and s2 using
+ unaligned dwords and compare. */
+ ld r7,0(r3)
+ ld r9,0(r4)
+ li r8,0
+ cmpb r8,r7,r8
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different1)
+
+ /* If the strings compared are equal, but size is less than or
+ equal to 8, return 0. */
+ cmpldi cr7,r10,8
+ li r9,0
+ ble cr7,L(ret1)
+ addi r5,r10,-8
+
+ ld r7,8(r3)
+ ld r9,8(r4)
+ cmpb r8,r7,r8
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different0)
+
+ cmpldi cr7,r5,8
+ mr r9,r8
+ ble cr7,L(ret1)
+
+ /* Update pointers and size. */
+ addi r10,r10,-16
+ addi r3,r3,16
+ addi r4,r4,16
+
+ /* The first 16 bytes have now been checked; align source1 to
+ doubleword and adjust the source2 address. */
+L(align_8b):
+ rldicl r5,r3,0,61
+ rldicr r3,r3,0,60
+ subf r4,r5,r4
+ add r10,r10,r5
+
+ /* At this point, source1 alignment is 0 and source2 alignment is
+ between 0 and 7. Check if source2 alignment is 0, meaning both
+ sources have the same alignment. */
+ andi. r8,r4,0x7
+ beq cr0,L(loop_eq_align_0)
+
+ li r5,0
+ b L(loop_ne_align_1)
+
+ /* If source2 is unaligned to doubleword, the code needs to check
+ on each iteration whether the unaligned doubleword access will
+ cross a 4k page boundary. */
+ .align 4
+L(loop_ne_align_0):
+ ld r7,0(r3)
+ ld r9,0(r4)
+ cmpb r8,r7,r5
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different1)
+
+ cmpldi cr7,r10,8
+ ble cr7,L(ret0)
+ addi r10,r10,-8
+ addi r3,r3,8
+ addi r4,r4,8
+L(loop_ne_align_1):
+ rldicl r9,r4,0,52
+ cmpldi cr7,r9,4088
+ ble cr7,L(loop_ne_align_0)
+ cmpdi cr7,r10,0
+ beq cr7,L(ret0)
+
+ lbz r9,0(r3)
+ lbz r8,0(r4)
+ cmplw cr7,r9,r8
+ bne cr7,L(byte_ne_4)
+ cmpdi cr7,r9,0
+ beq cr7,L(size_reached_0)
+
+ li r9,7
+ addi r8,r3,1
+ mtctr r9
+ addi r4,r4,1
+ addi r10,r10,-1
+ addi r3,r3,8
+
+ /* The unaligned read of source2 will cross a 4K page boundary,
+ and the differing byte or null terminator may be in the
+ remaining page bytes. Since the unaligned load cannot be used,
+ the algorithm reads and compares 8 bytes to keep source1
+ doubleword aligned. */
+ .align 4
+L(loop_ne_align_byte):
+ cmpdi cr7,r10,0
+ addi r10,r10,-1
+ beq cr7,L(ret0)
+ lbz r9,0(r8)
+ lbz r7,0(r4)
+ addi r8,r8,1
+ addi r4,r4,1
+ cmplw cr7,r9,r7
+ cmpdi cr5,r9,0
+ bne cr7,L(size_reached_2)
+ beq cr5,L(size_reached_0)
+ bdnz L(loop_ne_align_byte)
+
+ cmpdi cr7,r10,0
+ bne+ cr7,L(loop_ne_align_0)
+
+ .align 4
+L(ret0):
+ li r9,0
+L(ret1):
+ mr r3,r9
+ blr
+
+ /* The code now checks whether r8 and r10 differ by issuing a
+ cmpb and shifting the result based on its output:
+
+ #ifdef __LITTLE_ENDIAN__
+ leadzero = (__builtin_ffsl (z1) - 1);
+ leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+ r1 = (r1 >> leadzero) & 0xFFUL;
+ r2 = (r2 >> leadzero) & 0xFFUL;
+ #else
+ leadzero = __builtin_clzl (z1);
+ leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+ r1 = (r1 >> (56 - leadzero)) & 0xFFUL;
+ r2 = (r2 >> (56 - leadzero)) & 0xFFUL;
+ #endif
+ return r1 - r2; */
+
+ .align 4
+L(different0):
+ mr r10,r5
+#ifdef __LITTLE_ENDIAN__
+L(different1):
+ neg r11,r8
+ sldi r10,r10,3
+ and r8,r11,r8
+ addi r10,r10,-8
+ cntlzd r8,r8
+ subfic r8,r8,63
+ extsw r8,r8
+ cmpld cr7,r8,r10
+ ble cr7,L(different2)
+ mr r8,r10
+L(different2):
+ extsw r8,r8
+#else
+L(different1):
+ addi r10,r10,-1
+ cntlzd r8,r8
+ sldi r10,r10,3
+ cmpld cr7,r8,r10
+ blt cr7,L(different2)
+ mr r8,r10
+L(different2):
+ subfic r8,r8,56
+#endif
+ srd r7,r7,r8
+ srd r9,r9,r8
+ rldicl r3,r7,0,56
+ rldicl r9,r9,0,56
+ subf r9,r9,r3
+ extsw r9,r9
+ mr r3,r9
+ blr
+
+ /* If the unaligned 16-byte read crosses a 4K page boundary, use a
+ simple byte-by-byte comparison until the page alignment for s1
+ is reached. */
+ .align 4
+L(pagecross):
+ lbz r7,0(r3)
+ lbz r9,0(r4)
+ subfic r8,r8,4095
+ cmplw cr7,r9,r7
+ bne cr7,L(byte_ne_3)
+ cmpdi cr7,r9,0
+ beq cr7,L(byte_ne_0)
+ addi r10,r10,-1
+ subf r7,r8,r10
+ subf r9,r7,r10
+ addi r9,r9,1
+ mtctr r9
+ b L(pagecross_loop1)
+
+ .align 4
+L(pagecross_loop0):
+ beq cr7,L(ret0)
+ lbz r9,0(r3)
+ lbz r8,0(r4)
+ addi r10,r10,-1
+ cmplw cr7,r9,r8
+ cmpdi cr5,r9,0
+ bne cr7,L(byte_ne_2)
+ beq cr5,L(byte_ne_0)
+L(pagecross_loop1):
+ cmpdi cr7,r10,0
+ addi r3,r3,1
+ addi r4,r4,1
+ bdnz L(pagecross_loop0)
+ cmpdi cr7,r7,0
+ li r9,0
+ bne+ cr7,L(align_8b)
+ b L(ret1)
+
+ /* If both source1 and source2 are doubleword aligned, there is no
+ need for page boundary cross checks. */
+ .align 4
+L(loop_eq_align_0):
+ ld r7,0(r3)
+ ld r9,0(r4)
+ cmpb r8,r7,r8
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different1)
+
+ cmpldi cr7,r10,8
+ ble cr7,L(ret0)
+ addi r9,r10,-9
+
+ li r5,0
+ srdi r9,r9,3
+ addi r9,r9,1
+ mtctr r9
+ b L(loop_eq_align_2)
+
+ .align 4
+L(loop_eq_align_1):
+ bdz L(ret0)
+L(loop_eq_align_2):
+ ldu r7,8(r3)
+ addi r10,r10,-8
+ ldu r9,8(r4)
+ cmpb r8,r7,r5
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ beq cr0,L(loop_eq_align_1)
+ b L(different1)
+
+ .align 4
+L(byte_ne_0):
+ li r7,0
+L(byte_ne_1):
+ subf r9,r9,r7
+ extsw r9,r9
+ b L(ret1)
+
+ .align 4
+L(byte_ne_2):
+ extsw r7,r9
+ mr r9,r8
+ b L(byte_ne_1)
+L(size_reached_0):
+ li r10,0
+L(size_reached_1):
+ subf r9,r9,r10
+ extsw r9,r9
+ b L(ret1)
+L(size_reached_2):
+ extsw r10,r9
+ mr r9,r7
+ b L(size_reached_1)
+L(byte_ne_3):
+ extsw r7,r7
+ b L(byte_ne_1)
+L(byte_ne_4):
+ extsw r10,r9
+ mr r9,r8
+ b L(size_reached_1)
+END(strncmp)
+libc_hidden_builtin_def(strncmp)
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
new file mode 100644
index 0000000000..5fda953526
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -0,0 +1,424 @@
+/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPNCPY
+# define FUNC_NAME __stpncpy
+#else
+# define FUNC_NAME strncpy
+#endif
+
+/* Implements the function
+
+ char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ or
+
+ char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ if USE_AS_STPNCPY is defined.
+
+ The implementation uses unaligned doubleword access to avoid specialized
+ code paths depending on data alignment. Although recent powerpc64 uses
+ 64K as default, the page cross handling assumes minimum page size of
+ 4k. */
+
+ .machine power7
+EALIGN (FUNC_NAME, 4, 0)
+
+ /* Check if [src]+15 will cross a 4K page by checking if the bit
+ indicating the page size changes. Basically:
+
+ uint64_t srcin = (uint64_t)src;
+ uint64_t ob = srcin & 4096UL;
+ uint64_t nb = (srcin+15UL) & 4096UL;
+ if (ob ^ nb)
+ goto pagecross; */
+
+ addi r10,r4,16
+ rlwinm r9,r4,0,19,19
+
+ /* Since it is a leaf function, save some non-volatile registers in the
+ protected/red zone. */
+ std r26,-48(r1)
+ std r27,-40(r1)
+
+ rlwinm r8,r10,0,19,19
+
+ std r28,-32(r1)
+ std r29,-24(r1)
+
+ cmpld cr7,r9,r8
+
+ std r30,-16(r1)
+ std r31,-8(r1)
+
+ beq cr7,L(unaligned_lt_16)
+ rldicl r9,r4,0,61
+ subfic r8,r9,8
+ cmpld cr7,r5,r8
+ bgt cr7,L(pagecross)
+
+ /* At this point there are 1 to 15 bytes to check and write. Since they
+ could come either from the first unaligned 16-byte access or from the
+ bulk copy, the code uses an unrolled byte read/write instead of trying
+ to analyze the cmpb results.
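+ Semantically this byte path follows the usual strncpy contract; a
+ hedged C sketch (illustrative only):
+
+ while (n-- > 0)
+ if ((*dst++ = *src++) == '\0')
+ break; (the remainder is then zero padded)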
*/
+L(short_path):
+ mr r9,r3
+L(short_path_1):
+ cmpdi cr7,r5,0
+ beq cr7,L(short_path_loop_end_1)
+L(short_path_2):
+ lbz r10,0(r4)
+ cmpdi cr7,r10,0
+ stb r10,0(r9)
+ beq cr7,L(zero_pad_start_1)
+ cmpdi cr0,r5,1
+ addi r8,r9,1
+ addi r6,r5,-1
+ beq cr0,L(short_path_loop_end_0)
+ lbz r10,1(r4)
+ cmpdi cr7,r10,0
+ stb r10,1(r9)
+ beq cr7,L(zero_pad_start_prepare_1)
+ addi r10,r5,-3
+ b L(short_path_loop_1)
+
+ .align 4
+L(short_path_loop):
+ lbz r8,0(r4)
+ addi r7,r10,-2
+ cmpdi cr5,r8,0
+ stb r8,0(r9)
+ beq cr5,L(zero_pad_start_1)
+ beq cr7,L(short_path_loop_end_0)
+ lbz r8,1(r4)
+ cmpdi cr7,r8,0
+ stb r8,1(r9)
+ beq cr7,L(zero_pad_start)
+ mr r10,r7
+L(short_path_loop_1):
+ addic. r5,r5,-2
+ addi r9,r9,2
+ cmpdi cr7,r10,0
+ addi r4,r4,2
+ addi r6,r9,1
+ bne cr0,L(short_path_loop)
+#ifdef USE_AS_STPNCPY
+ mr r3,r9
+ b L(short_path_loop_end)
+#endif
+
+L(short_path_loop_end_0):
+#ifdef USE_AS_STPNCPY
+ addi r3,r9,1
+ b L(short_path_loop_end)
+#endif
+L(short_path_loop_end_1):
+#ifdef USE_AS_STPNCPY
+ mr r3,r9
+#endif
+L(short_path_loop_end):
+ /* Restore non-volatile registers. */
+ ld r26,-48(r1)
+ ld r27,-40(r1)
+ ld r28,-32(r1)
+ ld r29,-24(r1)
+ ld r30,-16(r1)
+ ld r31,-8(r1)
+ blr
+
+ /* This code pads the remainder of dest with null bytes. The
+ algorithm calculates the remaining size and issues a doubleword
+ unrolled loop followed by a byte-by-byte set. */
+ .align 4
+L(zero_pad_start):
+ mr r5,r10
+ mr r9,r6
+L(zero_pad_start_1):
+ srdi. r8,r5,3
+ mr r10,r9
+#ifdef USE_AS_STPNCPY
+ mr r3,r9
+#endif
+ beq- cr0,L(zero_pad_loop_b_start)
+ cmpldi cr7,r8,1
+ li r7,0
+ std r7,0(r9)
+ beq cr7,L(zero_pad_loop_b_prepare)
+ addic. r8,r8,-2
+ addi r10,r9,16
+ std r7,8(r9)
+ beq cr0,L(zero_pad_loop_dw_2)
+ std r7,16(r9)
+ li r9,0
+ b L(zero_pad_loop_dw_1)
+
+ .align 4
+L(zero_pad_loop_dw):
+ addi r10,r10,16
+ std r9,-8(r10)
+ beq cr0,L(zero_pad_loop_dw_2)
+ std r9,0(r10)
+L(zero_pad_loop_dw_1):
+ cmpldi cr7,r8,1
+ std r9,0(r10)
+ addic. r8,r8,-2
+ bne cr7,L(zero_pad_loop_dw)
+ addi r10,r10,8
+L(zero_pad_loop_dw_2):
+ rldicl r5,r5,0,61
+L(zero_pad_loop_b_start):
+ cmpdi cr7,r5,0
+ addi r5,r5,-1
+ addi r9,r10,-1
+ add r10,r10,r5
+ subf r10,r9,r10
+ li r8,0
+ beq- cr7,L(short_path_loop_end)
+
+ /* Write remaining 1-8 bytes. */
+ .align 4
+ addi r9,r9,1
+ mtocrf 0x1,r10
+ bf 29,4f
+ stw r8,0(r9)
+ addi r9,r9,4
+
+ .align 4
+4: bf 30,2f
+ sth r8,0(r9)
+ addi r9,r9,2
+
+ .align 4
+2: bf 31,1f
+ stb r8,0(r9)
+
+ /* Restore non-volatile registers. */
+1: ld r26,-48(r1)
+ ld r27,-40(r1)
+ ld r28,-32(r1)
+ ld r29,-24(r1)
+ ld r30,-16(r1)
+ ld r31,-8(r1)
+ blr
+
+ /* The common case where [src]+16 will not cross a 4K page boundary.
+ In this case the code fast-checks the first 16 bytes by using
+ doubleword reads/compares and updates the destination if neither
+ the total size is reached nor a null byte is found. */
+ .align 4
+L(unaligned_lt_16):
+ cmpldi cr7,r5,7
+ ble cr7,L(short_path)
+ ld r7,0(r4)
+ li r8,0
+ cmpb r8,r7,r8
+ cmpdi cr7,r8,0
+ bne cr7,L(short_path_prepare_2)
+ addi r6,r5,-8
+ std r7,0(r3)
+ addi r9,r3,8
+ cmpldi cr7,r6,7
+ addi r7,r4,8
+ ble cr7,L(short_path_prepare_1_1)
+ ld r4,8(r4)
+ cmpb r8,r4,r8
+ cmpdi cr7,r8,0
+ bne cr7,L(short_path_prepare_2_1)
+ std r4,8(r3)
+ addi r29,r3,16
+ addi r5,r5,-16
+ /* Neither a null byte was found nor the total length reached;
+ align to 16 bytes and issue a bulk copy/compare.
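+ The bulk loop reads two doublewords per step and merges their cmpb
+ results, so a single branch checks both halves for a null byte.
+ Roughly, as a C sketch (cmpb written as a function purely for
+ exposition):
+
+ a = src8[0]; b = src8[1];
+ if ((cmpb (a, 0) | cmpb (b, 0)) != 0)
+ finish byte by byte;
+ dst8[0] = a; dst8[1] = b;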
 .align 4 +L(zero_pad_start): + mr r5,r10 + mr r9,r6 +L(zero_pad_start_1): + srdi. r8,r5,3 + mr r10,r9 +#ifdef USE_AS_STPNCPY + mr r3,r9 +#endif + beq- cr0,L(zero_pad_loop_b_start) + cmpldi cr7,r8,1 + li r7,0 + std r7,0(r9) + beq cr7,L(zero_pad_loop_b_prepare) + addic. r8,r8,-2 + addi r10,r9,16 + std r7,8(r9) + beq cr0,L(zero_pad_loop_dw_2) + std r7,16(r9) + li r9,0 + b L(zero_pad_loop_dw_1) + + .align 4 +L(zero_pad_loop_dw): + addi r10,r10,16 + std r9,-8(r10) + beq cr0,L(zero_pad_loop_dw_2) + std r9,0(r10) +L(zero_pad_loop_dw_1): + cmpldi cr7,r8,1 + std r9,0(r10) + addic. r8,r8,-2 + bne cr7,L(zero_pad_loop_dw) + addi r10,r10,8 +L(zero_pad_loop_dw_2): + rldicl r5,r5,0,61 +L(zero_pad_loop_b_start): + cmpdi cr7,r5,0 + addi r5,r5,-1 + addi r9,r10,-1 + add r10,r10,r5 + subf r10,r9,r10 + li r8,0 + beq- cr7,L(short_path_loop_end) + + /* Write remaining 1-8 bytes. */ + .align 4 + addi r9,r9,1 + mtocrf 0x1,r10 + bf 29,4f + stw r8,0(r9) + addi r9,r9,4 + + .align 4 +4: bf 30,2f + sth r8,0(r9) + addi r9,r9,2 + + .align 4 +2: bf 31,1f + stb r8,0(r9) + + /* Restore non-volatile registers. */ +1: ld r26,-48(r1) + ld r27,-40(r1) + ld r28,-32(r1) + ld r29,-24(r1) + ld r30,-16(r1) + ld r31,-8(r1) + blr + + /* The common case where [src]+16 will not cross a 4K page boundary. + In this case the code quickly checks the first 16 bytes by using doubleword + reads/compares and updates dest if neither the total size limit nor a null byte + is found. */ + .align 4 +L(unaligned_lt_16): + cmpldi cr7,r5,7 + ble cr7,L(short_path) + ld r7,0(r4) + li r8,0 + cmpb r8,r7,r8 + cmpdi cr7,r8,0 + bne cr7,L(short_path_prepare_2) + addi r6,r5,-8 + std r7,0(r3) + addi r9,r3,8 + cmpldi cr7,r6,7 + addi r7,r4,8 + ble cr7,L(short_path_prepare_1_1) + ld r4,8(r4) + cmpb r8,r4,r8 + cmpdi cr7,r8,0 + bne cr7,L(short_path_prepare_2_1) + std r4,8(r3) + addi r29,r3,16 + addi r5,r5,-16 + /* Neither a null byte was found nor was the total length reached; + align to 16 bytes and issue a bulk copy/compare. */ + b L(align_to_16b) + + /* In the case of a 4k page boundary cross, the algorithm first aligns + the address to a doubleword, calculates a mask based on the alignment + to ignore the leading bytes, and continues using doublewords. */ + .align 4 +L(pagecross): + rldicr r11,r4,0,59 /* Align the address to 8 bytes boundary. */ + li r6,-1 /* MASK = 0xffffffffffffffffUL. */ + sldi r9,r9,3 /* Calculate padding. */ + ld r7,0(r11) /* Load doubleword from memory. */ +#ifdef __LITTLE_ENDIAN__ + sld r9,r6,r9 /* MASK = MASK << padding. */ +#else + srd r9,r6,r9 /* MASK = MASK >> padding. */ +#endif + orc r9,r7,r9 /* Mask bits that are not part of the + string. */ + li r7,0 + cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */ + cmpdi cr7,r9,0 + bne cr7,L(short_path_prepare_2) + subf r8,r8,r5 /* Adjust total length. */ + cmpldi cr7,r8,8 /* Check if length was reached. */ + ble cr7,L(short_path_prepare_2) + + /* For the next checks we have an aligned address, so we check three + more doublewords to make sure we can read 16 unaligned bytes + to start the bulk copy with 16-byte aligned addresses. */ + ld r7,8(r11) + cmpb r9,r7,r9 + cmpdi cr7,r9,0 + bne cr7,L(short_path_prepare_2) + addi r8,r8,-8 + cmpldi cr7,r8,8 + ble cr7,L(short_path_prepare_2) + ld r7,16(r11) + cmpb r9,r7,r9 + cmpdi cr7,r9,0 + bne cr7,L(short_path_prepare_2) + addi r8,r8,-16 + cmpldi cr7,r8,8 + ble cr7,L(short_path_prepare_2) + ld r8,24(r11) + cmpb r9,r8,r9 + cmpdi cr7,r9,0 + bne cr7,L(short_path_prepare_2) + + /* No null byte found in the 32 bytes read and length not reached; + read the source again using unaligned loads and store them. */ + ld r9,0(r4) + addi r29,r3,16 + addi r5,r5,-16 + std r9,0(r3) + ld r9,8(r4) + std r9,8(r3) + + /* Align source to 16 bytes and adjust dest and size. */ +L(align_to_16b): + rldicl r9,r10,0,60 + rldicr r28,r10,0,59 + add r12,r5,r9 + subf r29,r9,r29 + + /* The bulk read/compare/copy loads two doublewords, compares and merges + them in a single register for speed. This is an attempt to speed up the + null-checking process for bigger strings. */
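/* cmpb writes 0xff into each byte of the result where the corresponding
   bytes of the two inputs are equal, and 0x00 elsewhere.  An editorial C
   model of the null-byte test the loop below performs (cmpb itself is a
   single instruction):

     #include <stdint.h>

     static inline uint64_t
     cmpb_model (uint64_t a, uint64_t b)
     {
       uint64_t r = 0;
       for (int i = 0; i < 8; i++)
         {
           uint64_t m = 0xffULL << (8 * i);
           if ((a & m) == (b & m))
             r |= m;
         }
       return r;
     }

   With b == 0, a nonzero result means some byte is '\0'; the loop ORs
   two such results together so a single branch tests 16 bytes.  */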
+ cmpldi cr7,r12,15 + ble cr7,L(short_path_prepare_1_2) + + /* Main loop for large sizes, unrolled 2 times to get better use of + pipeline. */ + ld r8,0(r28) + ld r10,8(r28) + li r9,0 + cmpb r7,r8,r9 + cmpb r9,r10,r9 + or. r6,r9,r7 + bne cr0,L(short_path_prepare_2_3) + addi r5,r12,-16 + addi r4,r28,16 + std r8,0(r29) + std r10,8(r29) + cmpldi cr7,r5,15 + addi r9,r29,16 + ble cr7,L(short_path_1) + mr r11,r28 + mr r6,r29 + li r30,0 + subfic r26,r4,48 + subfic r27,r9,48 + + b L(loop_16b) + + .align 4 +L(loop_start): + ld r31,0(r11) + ld r10,8(r11) + cmpb r0,r31,r7 + cmpb r8,r10,r7 + or. r7,r0,r8 + addi r5,r5,-32 + cmpldi cr7,r5,15 + add r4,r4,r26 + add r9,r9,r27 + bne cr0,L(short_path_prepare_2_2) + add r4,r28,r4 + std r31,0(r6) + add r9,r29,r9 + std r10,8(r6) + ble cr7,L(short_path_1) + +L(loop_16b): + ld r10,16(r11) + ld r0,24(r11) + cmpb r8,r10,r30 + cmpb r7,r0,r30 + or. r7,r8,r7 + addi r12,r12,-32 + cmpldi cr7,r12,15 + addi r11,r11,32 + bne cr0,L(short_path_2) + std r10,16(r6) + addi r6,r6,32 + std r0,-8(r6) + bgt cr7,L(loop_start) + + mr r5,r12 + mr r4,r11 + mr r9,r6 + b L(short_path_1) + + .align 4 +L(short_path_prepare_1_1): + mr r5,r6 + mr r4,r7 + b L(short_path_1) +L(short_path_prepare_1_2): + mr r5,r12 + mr r4,r28 + mr r9,r29 + b L(short_path_1) +L(short_path_prepare_2): + mr r9,r3 + b L(short_path_2) +L(short_path_prepare_2_1): + mr r5,r6 + mr r4,r7 + b L(short_path_2) +L(short_path_prepare_2_2): + mr r5,r12 + mr r4,r11 + mr r9,r6 + b L(short_path_2) +L(short_path_prepare_2_3): + mr r5,r12 + mr r4,r28 + mr r9,r29 + b L(short_path_2) +L(zero_pad_loop_b_prepare): + addi r10,r9,8 + rldicl r5,r5,0,61 + b L(zero_pad_loop_b_start) +L(zero_pad_start_prepare_1): + mr r5,r6 + mr r9,r8 + b L(zero_pad_start_1) +END (FUNC_NAME) + +#ifdef USE_AS_STPNCPY +libc_hidden_def (__stpncpy) +#else +libc_hidden_builtin_def (strncpy) +#endif diff --git a/sysdeps/powerpc/powerpc64/power7/strcspn.S b/sysdeps/powerpc/powerpc64/strcspn.S index 3f6aa0a506..1121930b15 100644 --- a/sysdeps/powerpc/powerpc64/power7/strcspn.S +++ b/sysdeps/powerpc/powerpc64/strcspn.S @@ -20,54 +20,42 @@ /* size_t [r3] strcspn (const char [r4] *s, const char [r5] *reject) */ - .machine power7 EALIGN (strcspn, 4, 0) CALL_MCOUNT 3 /* The idea to speed up the algorithm is to create a lookup table for fast check if input character should be considered. For ASCII or ISO-8859-X character sets it has 256 positions. */ - lbz r10,0(r4) - - /* First the table should be cleared and to avoid unaligned accesses - when using the VSX stores the table address is aligned to 16 - bytes. */ - xxlxor v0,v0,v0 /* PPC64 ELF ABI stack is aligned to 16 bytes. */ addi r9,r1,-256 + /* Clear the table with 0 values */ + li r6, 0 + li r8, 4 + mtctr r8 + mr r10, r9 + .align 4 +L(zerohash): + std r6, 0(r10) + std r6, 8(r10) + std r6, 16(r10) + std r6, 24(r10) + std r6, 32(r10) + std r6, 40(r10) + std r6, 48(r10) + std r6, 56(r10) + addi r10, r10, 64 + bdnz L(zerohash) - li r8,48 - li r5,16 - li r6,32 + lbz r10,0(r4) cmpdi cr7,r10,0 /* reject[0] == '\0' ? */ - addi r12,r9,64 - /* Clear the table with 0 values */ - stxvw4x v0,r0,r9 - addi r11,r9,128 - addi r7,r9,192 - stxvw4x v0,r9,r5 - stxvw4x v0,r9,r6 - stxvw4x v0,r9,r8 - stxvw4x v0,r0,r12 - stxvw4x v0,r12,r5 - stxvw4x v0,r12,r6 - stxvw4x v0,r12,r8 - stxvw4x v0,r0,r11 - stxvw4x v0,r11,r5 - stxvw4x v0,r11,r6 - stxvw4x v0,r11,r8 - stxvw4x v0,r0,r7 - stxvw4x v0,r7,r5 - stxvw4x v0,r7,r6 - stxvw4x v0,r7,r8 li r8,1 beq cr7,L(finish_table) /* If reject[0] == '\0' skip */ /* Initialize the table as: for (i=0; reject[i]; i++ table[reject[i]]] = 1 */ - .p2align 4,,15 + .align 4 L(init_table): stbx r8,r9,r10 lbzu r10,1(r4) @@ -93,7 +81,7 @@ L(finish_table): if (table[input[i++]] == 1) return i - 1; } */ - .p2align 4,,15 + .align 4 L(unroll): lbz r8,1(r3) addi r10,r10,4 @@ -121,17 +109,17 @@ L(mainloop): mr r3,r10 blr - .p2align 4,,15 + .align 4 L(end): mr r3,r6 blr - .p2align 4,,15 + .align 4 L(end2): mr r3,r4 blr - .p2align 4,,15 + .align 4 L(end3): mr r3,r5 blr diff --git a/sysdeps/powerpc/powerpc64/power7/strpbrk.S b/sysdeps/powerpc/powerpc64/strpbrk.S index d6204a7754..6b2ad4d1aa 100644 --- a/sysdeps/powerpc/powerpc64/power7/strpbrk.S +++ b/sysdeps/powerpc/powerpc64/strpbrk.S @@ -1,4 +1,4 @@ -/* Optimized strpbrk implementation for PowerPC64/POWER7. +/* Optimized strpbrk implementation for PowerPC64. Copyright (C) 2014 Free Software Foundation, Inc.
This file is part of the GNU C Library. @@ -20,7 +20,6 @@ /* char [r3] *strpbrk(const char [r4] *s, const char [r5] *accept) */ - .machine power7 EALIGN (strpbrk, 4, 0) CALL_MCOUNT 3 @@ -32,43 +31,31 @@ EALIGN (strpbrk, 4, 0) for fast check if input character should be considered. For ASCII or ISO-8859-X character sets it has 256 positions. */ - /* First the table should be cleared and to avoid unaligned accesses - when using the VSX stores the table address is aligned to 16 - bytes. */ - xxlxor v0,v0,v0 - - /* PPC64 ELF ABI stack is aligned to 16 bytes */ + /* PPC64 ELF ABI stack is aligned to 16 bytes. */ addi r9,r1,-256 - - li r5,16 - li r6,32 - li r8,48 - addi r12,r9,64 /* Clear the table with 0 values */ - stxvw4x v0,r0,r9 - addi r11,r9,128 - addi r7,r9,192 - stxvw4x v0,r9,r5 - li r0,1 - stxvw4x v0,r9,r6 - stxvw4x v0,r9,r8 - stxvw4x v0,r0,r12 - stxvw4x v0,r12,r5 - stxvw4x v0,r12,r6 - stxvw4x v0,r12,r8 - stxvw4x v0,r0,r11 - stxvw4x v0,r11,r5 - stxvw4x v0,r11,r6 - stxvw4x v0,r11,r8 - stxvw4x v0,r0,r7 - stxvw4x v0,r7,r5 - stxvw4x v0,r7,r6 - stxvw4x v0,r7,r8 + li r6, 0 + li r7, 4 + mtctr r7 + mr r8, r9 + .align 4 +L(zerohash): + std r6, 0(r8) + std r6, 8(r8) + std r6, 16(r8) + std r6, 24(r8) + std r6, 32(r8) + std r6, 40(r8) + std r6, 48(r8) + std r6, 56(r8) + addi r8, r8, 64 + bdnz L(zerohash) /* Initialize the table as: for (i=0; accept[i]; i++ table[accept[i]]] = 1 */ - .p2align 4,,15 + li r0,1 + .align 4 L(init_table): stbx r0,r9,r10 lbzu r10,1(r4) @@ -93,7 +80,7 @@ L(finish_table): if (table[input[i++]] == 1) return (s[i -1] ? s + i - 1: NULL); } */ - .p2align 4 + .align 4 L(unroll): lbz r0,1(r3) lbzx r8,r9,r0 @@ -121,7 +108,7 @@ L(mainloop): L(end): blr - .p2align 4 + .align 4 L(checkend): cmpdi cr1,r12,0 mr r3,r7 @@ -131,14 +118,14 @@ L(nullfound): li 3,0 blr - .p2align 4 + .align 4 L(checkend2): cmpdi cr7,r0,0 mr r3,r11 beq cr7,L(nullfound) blr - .p2align 4 + .align 4 L(checkend3): cmpdi cr6,r10,0 mr r3,r5 diff --git a/sysdeps/powerpc/powerpc64/power7/strspn.S b/sysdeps/powerpc/powerpc64/strspn.S index d587a673f2..daf5d5d747 100644 --- a/sysdeps/powerpc/powerpc64/power7/strspn.S +++ b/sysdeps/powerpc/powerpc64/strspn.S @@ -1,4 +1,4 @@ -/* Optimized strspn implementation for PowerPC64/POWER7. +/* Optimized strspn implementation for PowerPC64. Copyright (C) 2014 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -25,8 +25,6 @@ > hashing of needle. > hashing avoids scanning of duplicate entries in needle across the string. - > initializing the hash table with Vector instructions - by quadword access. > unrolling when scanning for character in string across hash table. 
*/ @@ -46,55 +44,36 @@ #include <sysdep.h> -#undef strspn - - .machine power7 EALIGN(strspn, 4, 0) - CALL_MCOUNT 2 - - lbz r10, 0(r4) /* load r10 with needle (r4) */ - addi r9, r1, -256 /* r9 is a hash of 256 bytes */ - - li r5, 16 /* set r5 = 16 as offset */ - li r6, 32 /* set r6 = 32 as offset */ - li r8, 48 /* set r8 = 48 as offset */ - -/*Iniatliaze hash table with Zeroes in double indexed quadword accesses */ - xxlxor v0, v0, v0 /* prepare for initializing hash */ - - stxvd2x v0, r0, r9 /* initialize 1st quadword */ - stxvd2x v0, r9, r5 - stxvd2x v0, r9, r6 - stxvd2x v0, r9, r8 /* initialize 4th quadword */ - - addi r11, r9, 64 /* r11 is index to hash */ - - stxvd2x v0, r0, r11 /* initialize 5th quadword */ - stxvd2x v0, r11, r5 - stxvd2x v0, r11, r6 - stxvd2x v0, r11, r8 /* initialize 8th quadword */ - - addi r11, r9, 128 /* r11 is index to hash */ - - stxvd2x v0, r0, r11 /* initialize 9th quadword */ - stxvd2x v0, r11, r5 - stxvd2x v0, r11, r6 - stxvd2x v0, r11, r8 /* initialize 12th quadword */ - - addi r11, r9, 192 /* r11 is index to hash */ - - stxvd2x v0, r0, r11 /* initialize 13th quadword */ - stxvd2x v0, r11, r5 - stxvd2x v0, r11, r6 - stxvd2x v0, r11, r8 /* initialize 16th quadword */ - + CALL_MCOUNT 3 + + /* PPC64 ELF ABI stack is aligned to 16 bytes. */ + addi r9,r1,-256 + /* Clear the table with 0 values */ + li r6, 0 + li r8, 4 + mtctr r8 + mr r10, r9 + .align 4 +L(zerohash): + std r6, 0(r10) + std r6, 8(r10) + std r6, 16(r10) + std r6, 24(r10) + std r6, 32(r10) + std r6, 40(r10) + std r6, 48(r10) + std r6, 56(r10) + addi r10, r10, 64 + bdnz L(zerohash) + + lbz r10,0(r4) li r8, 1 /* r8=1, marker into hash if found in needle */ - cmpdi cr7, r10, 0 /* accept needle is NULL */ beq cr7, L(skipHashing) /* if needle is NULL, skip hashing */ - .p2align 4 /* align section to 16 byte boundary */ + .align 4 /* align section to 16 byte boundary */ L(hashing): stbx r8, r9, r10 /* update hash with marker for the pivot of the needle */ @@ -106,7 +85,7 @@ L(skipHashing): li r10, 0 /* load counter = 0 */ b L(beginScan) - .p2align 4 /* align section to 16 byte boundary */ + .align 4 /* align section to 16 byte boundary */ L(scanUnroll): lbzx r8, r9, r8 /* load r8 with hash value at index */ cmpwi cr7, r8, 0 /* if we hit marker in hash, we have found diff --git a/sysdeps/powerpc/powerpc64/strtok.S b/sysdeps/powerpc/powerpc64/strtok.S new file mode 100644 index 0000000000..fa816f2950 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/strtok.S @@ -0,0 +1,226 @@ +/* Optimized strtok implementation for PowerPC64. + + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Performance gains are grabbed through following techniques: + + > hashing of needle. + > hashing avoids scanning of duplicate entries in needle + across the string. 
+ > unrolling when scanning for character in string + across hash table. */ + +/* Algorithm is as below: + 1. An empty hash table/dictionary is created comprising the + 256-entry ASCII character set + 2. When a hash entry is found in the needle, the hash index + is initialized to 1 + 3. The string is scanned until end and for every character, + its corresponding hash index is compared. + 4. The initial length of the string (count) until the first hit of + the accept needle is calculated and skipped (strspn). + 5. The string is again scanned until end and for every character, + its corresponding hash index is compared (strpbrk). + 6. If the hash index is set to 1 for an index of the string, + set it to null and set the saveptr to point to the next char. + 7. Otherwise the count is incremented and scanning continues + until the end of the string. */
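/* An editorial C sketch of the steps above (illustrative only; the real
   saveptr handling lives in the TOC-anchor / r5 code below).  The step
   numbers refer to the algorithm comment:

     #include <stddef.h>

     static char *
     strtok_sketch (char *s, const char *needle, char **saveptr)
     {
       unsigned char table[256] = { 0 };        // step 1
       char *p;

       if (s == NULL)
         s = *saveptr;
       if (s == NULL || *s == '\0')
         return NULL;

       for (p = (char *) needle; *p != '\0'; p++)
         table[(unsigned char) *p] = 1;         // step 2

       while (*s != '\0' && table[(unsigned char) *s] == 1)
         s++;                                   // steps 3-4 (strspn)
       if (*s == '\0')
         {
           *saveptr = s;
           return NULL;
         }

       for (p = s; *p != '\0'; p++)             // step 5 (strpbrk)
         if (table[(unsigned char) *p] == 1)
           {
             *p = '\0';                         // step 6
             *saveptr = p + 1;
             return s;
           }
       *saveptr = p;                            // step 7
       return s;
     }  */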
+ +#include <sysdep.h> +#ifdef USE_AS_STRTOK_R +# define FUNC_NAME __strtok_r +#else +# define FUNC_NAME strtok +#endif + +EALIGN(FUNC_NAME, 4, 0) +#ifdef USE_AS_STRTOK_R + CALL_MCOUNT 3 + cmpdi cr7, r3, 0 /* Is input null? */ + bne cr7, L(inputnotNull) + ld r3, 0(r5) /* Load from r5 */ +#else + CALL_MCOUNT 2 + addis r5, r2, .LANCHOR0@toc@ha + cmpdi cr7, r3, 0 /* Is r3 NULL? */ + bne cr7, L(inputnotNull) + ld r3, .LANCHOR0@toc@l(r5) /* Load from saveptr */ +#endif +L(inputnotNull): + mr r7, r3 + cmpdi cr7, r3, 0 + beq cr7, L(returnNULL) + lbz r8, 0(r3) + cmpdi cr7, r8, 0 + beq cr7, L(returnNULL) + + addi r9, r1, -256 /* r9 is a hash of 256 bytes */ + + /* Initialize hash table with zeroes */ + li r6, 0 + li r8, 4 + mtctr r8 + mr r10, r9 + .align 4 +L(zerohash): + std r6, 0(r10) + std r6, 8(r10) + std r6, 16(r10) + std r6, 24(r10) + std r6, 32(r10) + std r6, 40(r10) + std r6, 48(r10) + std r6, 56(r10) + addi r10, r10, 64 + bdnz L(zerohash) + + + lbz r10, 0(r4) /* load r10 with needle (r4) */ + li r8, 1 /* r8=1, marker into hash if found in + needle */ + + cmpdi cr7, r10, 0 /* accept needle is NULL */ + beq cr7, L(skipHashing) /* if needle is NULL, skip hashing */ + + .align 4 /* align section to 16 byte boundary */ +L(hashing): + stbx r8, r9, r10 /* update hash with marker for the pivot of + the needle */ + lbzu r10, 1(r4) /* load needle into r10 and update to next */ + cmpdi cr7, r10, 0 /* if needle has reached NULL, continue */ + bne cr7, L(hashing) /* loop to hash the needle */ + +L(skipHashing): + b L(beginScan) + + .align 4 /* align section to 16 byte boundary */ +L(scanUnroll): + lbzx r8, r9, r8 /* load r8 with hash value at index */ + cmpwi cr7, r8, 0 /* check the hash value */ + beq cr7, L(ret1stIndex) /* we have hit accept needle */ + + lbz r8, 1(r7) /* load string[1] into r8 */ + lbzx r8, r9, r8 /* load r8 with hash value at index */ + cmpwi cr7, r8, 0 /* check the hash value */ + beq cr7, L(ret2ndIndex) /* we have hit accept needle */ + + lbz r8, 2(r7) /* load string[2] into r8 */ + lbzx r8, r9, r8 /* load r8 with hash value at index */ + cmpwi cr7, r8, 0 /* check the hash value */ + beq cr7, L(ret3rdIndex) /* we have hit accept needle */ + + lbz r8, 3(r7) /* load string[3] into r8 */ + addi r7, r7, 4 + lbzx r8, r9, r8 /* load r8 with hash value at index */ + cmpwi cr7, r8, 0 /* check the hash value */ + beq cr7,L(ret4thIndex) /* we have hit accept needle */ + +L(beginScan): + lbz r8, 0(r7) /* load string[0] into r8 */ + addi r6, r7, 1 + addi r11, r7, 2 + addi r4, r7, 3 + cmpdi cr7, r8, 0 /* check if its null */ + bne cr7, L(scanUnroll) /* continue scanning */ + +L(ret1stIndex): + mr r3, r7 + b L(next) +L(ret2ndIndex): + mr r3, r6 + b L(next) +L(ret3rdIndex): + mr r3, r11 + b L(next) +L(ret4thIndex): + mr r3, r4 +L(next): + mr r7, r3 + lbz r8, 0(r7) + cmpdi cr7, r8, 0 + beq cr7, L(returnNULL) + li r8, 1 + li r10, 0 /* load counter = 0 */ + stbx r8, r9, r10 /* update hash for NULL */ + b L(mainloop) + +L(unroll): + lbz r8, 1(r7) /* load string[1] into r8 */ + lbzx r8, r9, r8 /* load r8 with hash value at index */ + cmpwi cr7, r8, 1 /* check the hash */ + beq cr7, L(foundat1st) /* we have hit accept needle */ + lbz r8, 2(r7) + lbzx r8, r9, r8 + cmpwi cr7, r8, 1 + beq cr7, L(foundat2nd) + lbz r8, 3(r7) + addi r7, r7, 4 + lbzx r8, r9, r8 + cmpwi cr7, r8, 1 + beq cr7, L(foundat3rd) +L(mainloop): + lbz r8, 0(r7) + addi r6, r7, 1 + addi r11, r7, 2 + addi r4, r7, 3 + lbzx r8, r9, r8 + cmpwi cr7, r8, 1 + bne cr7, L(unroll) /* continue scanning */ + + b L(found) +L(foundat1st): + mr r7, r6 + b L(found) +L(foundat2nd): + mr r7, r11 + b L(found) +L(foundat3rd): + mr r7, r4 +L(found): + lbz r8, 0(r7) + cmpdi cr7, r8, 0 + beq cr7, L(end) + li r10, 0 + stb r10, 0(r7) /* Terminate string */ + addi r7, r7, 1 /* Store the pointer to the next char */ +L(end): +#ifdef USE_AS_STRTOK_R + std r7, 0(r5) /* Update saveptr */ +#else + std r7, .LANCHOR0@toc@l(r5) +#endif + blr /* done */ +L(returnNULL): +#ifndef USE_AS_STRTOK_R + li r7, 0 +#endif + li r3, 0 /* return NULL */ + b L(end) +END(FUNC_NAME) +#ifdef USE_AS_STRTOK_R +libc_hidden_builtin_def (strtok_r) +#else + .section ".bss" + .align 3 + .set .LANCHOR0,. + 0 + .type olds, @object + .size olds, 8 +olds: + .zero 8 +libc_hidden_builtin_def (strtok) +#endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c b/sysdeps/powerpc/powerpc64/strtok_r.S index 3609d93ad2..6e5d301035 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c +++ b/sysdeps/powerpc/powerpc64/strtok_r.S @@ -1,4 +1,4 @@ -/* Multiple versions of strcspn. PowerPC64 version. +/* Optimized strtok_r implementation for PowerPC64. Copyright (C) 2014 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,16 +16,9 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#ifndef NOT_IN_libc -# include <string.h> -# include <shlib-compat.h> -# include "init-arch.h" +#define USE_AS_STRTOK_R +#include <sysdeps/powerpc/powerpc64/strtok.S> -extern __typeof (strcspn) __strcspn_ppc attribute_hidden; -extern __typeof (strcspn) __strcspn_power7 attribute_hidden; - -libc_ifunc (strcspn, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __strcspn_power7 - : __strcspn_ppc); -#endif +weak_alias (__strtok_r, strtok_r) +libc_hidden_def (__strtok_r) +libc_hidden_builtin_def (strtok_r) diff --git a/sysdeps/powerpc/powerpc64/sysdep.h b/sysdeps/powerpc/powerpc64/sysdep.h index b28fb9d8aa..1636899c1a 100644 --- a/sysdeps/powerpc/powerpc64/sysdep.h +++ b/sysdeps/powerpc/powerpc64/sysdep.h @@ -283,7 +283,23 @@ LT_LABELSUFFIX(name,_name_end): ; \ TRACEBACK_MASK(name,mask) \ END_2(name) +#if !defined IS_IN_rtld && defined (ENABLE_LOCK_ELISION) +# define ABORT_TRANSACTION \ + cmpdi 13,0; \ + beq 1f; \ + lwz 0,TM_CAPABLE(13); \ + cmpwi 0,0; \ + beq 1f; \ + li 11,_ABORT_SYSCALL; \ + tabort.
11; \ + .align 4; \ +1: +#else +# define ABORT_TRANSACTION +#endif + #define DO_CALL(syscall) \ + ABORT_TRANSACTION \ li 0,syscall; \ sc diff --git a/sysdeps/powerpc/sysdep.h b/sysdeps/powerpc/sysdep.h index e6627c071f..04d109f0e0 100644 --- a/sysdeps/powerpc/sysdep.h +++ b/sysdeps/powerpc/sysdep.h @@ -21,6 +21,10 @@ */ #define _SYSDEPS_SYSDEP_H 1 #include <bits/hwcap.h> +#ifdef ENABLE_LOCK_ELISION +#include <tls.h> +#include <htm.h> +#endif #define PPC_FEATURE_970 (PPC_FEATURE_POWER4 + PPC_FEATURE_HAS_ALTIVEC) @@ -164,4 +168,22 @@ #define ALIGNARG(log2) log2 #define ASM_SIZE_DIRECTIVE(name) .size name,.-name +#else + +/* Linux kernel powerpc documentation [1] states issuing a syscall inside a + transaction is not recommended and may lead to undefined behavior. It + also states syscalls do not abort transactions. To avoid such traps, + we abort transaction just before syscalls. + + [1] Documentation/powerpc/transactional_memory.txt [Syscalls] */ +#if !defined IS_IN_rtld && defined (ENABLE_LOCK_ELISION) +# define ABORT_TRANSACTION \ + ({ \ + if (THREAD_GET_TM_CAPABLE ()) \ + __builtin_tabort (_ABORT_SYSCALL); \ + }) +#else +# define ABORT_TRANSACTION +#endif + #endif /* __ASSEMBLER__ */ diff --git a/sysdeps/unix/sysv/linux/powerpc/Makefile b/sysdeps/unix/sysv/linux/powerpc/Makefile index 28f7165815..974a1bf749 100644 --- a/sysdeps/unix/sysv/linux/powerpc/Makefile +++ b/sysdeps/unix/sysv/linux/powerpc/Makefile @@ -34,4 +34,6 @@ endif ifeq ($(subdir),nptl) libpthread-routines += sysdep +libpthread-sysdep_routines += elision-lock elision-unlock elision-timed \ + elision-trylock endif diff --git a/sysdeps/unix/sysv/linux/powerpc/bits/pthreadtypes.h b/sysdeps/unix/sysv/linux/powerpc/bits/pthreadtypes.h index 4e9c5184aa..998f6d42b8 100644 --- a/sysdeps/unix/sysv/linux/powerpc/bits/pthreadtypes.h +++ b/sysdeps/unix/sysv/linux/powerpc/bits/pthreadtypes.h @@ -90,14 +90,23 @@ typedef union binary compatibility. */ int __kind; #if __WORDSIZE == 64 - int __spins; + short __spins; + short __elision; __pthread_list_t __list; # define __PTHREAD_MUTEX_HAVE_PREV 1 +# define __PTHREAD_SPINS 0, 0 #else unsigned int __nusers; __extension__ union { - int __spins; + struct + { + short __espins; + short __elision; +# define __spins __elision_data.__espins +# define __elision __elision_data.__elision +# define __PTHREAD_SPINS { 0, 0 } + } __elision_data; __pthread_slist_t __list; }; #endif @@ -106,9 +115,6 @@ typedef union long int __align; } pthread_mutex_t; -/* Mutex __spins initializer used by PTHREAD_MUTEX_INITIALIZER. */ -#define __PTHREAD_SPINS 0 - typedef union { char __size[__SIZEOF_PTHREAD_MUTEXATTR_T]; @@ -166,11 +172,13 @@ typedef union unsigned int __nr_writers_queued; int __writer; int __shared; - unsigned long int __pad1; + unsigned char __rwelision; + unsigned char __pad1[7]; unsigned long int __pad2; /* FLAGS must stay at this position in the structure to maintain binary compatibility. */ unsigned int __flags; +# define __PTHREAD_RWLOCK_ELISION_EXTRA 0, {0, 0, 0, 0, 0, 0, 0 } } __data; # else struct @@ -181,20 +189,20 @@ typedef union unsigned int __writer_wakeup; unsigned int __nr_readers_queued; unsigned int __nr_writers_queued; - unsigned char __pad1; + unsigned char __rwelision; unsigned char __pad2; unsigned char __shared; /* FLAGS must stay at this position in the structure to maintain binary compatibility. 
*/ unsigned char __flags; int __writer; +#define __PTHREAD_RWLOCK_ELISION_EXTRA 0 } __data; # endif char __size[__SIZEOF_PTHREAD_RWLOCK_T]; long int __align; } pthread_rwlock_t; -#define __PTHREAD_RWLOCK_ELISION_EXTRA 0 typedef union { diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-conf.c b/sysdeps/unix/sysv/linux/powerpc/elision-conf.c new file mode 100644 index 0000000000..70fbbb2215 --- /dev/null +++ b/sysdeps/unix/sysv/linux/powerpc/elision-conf.c @@ -0,0 +1,83 @@ +/* elision-conf.c: Lock elision tunable parameters. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include "config.h" +#include <pthreadP.h> +#include <elision-conf.h> +#include <unistd.h> +#include <dl-procinfo.h> + +/* Reasonable initial tuning values, may be revised in the future. + This is a conservative initial value. */ + +struct elision_config __elision_aconf = + { + /* How many times to use a non-transactional lock after a transactional + failure has occurred because the lock is already acquired. Expressed + in number of lock acquisition attempts. */ + .skip_lock_busy = 3, + /* How often to not attempt to use elision if a transaction aborted due + to reasons other than other threads' memory accesses. Expressed in + number of lock acquisition attempts. */ + .skip_lock_internal_abort = 3, + /* How often to not attempt to use elision if a lock used up all retries + without success. Expressed in number of lock acquisition attempts. */ + .skip_lock_out_of_tbegin_retries = 3, + /* How often we retry using elision if there is chance for the transaction + to finish execution (e.g., it wasn't aborted due to the lock being + already acquired. */ + .try_tbegin = 3, + /* Same as SKIP_LOCK_INTERNAL_ABORT but for trylock. */ + .skip_trylock_internal_abort = 3, + }; + +/* Force elision for all new locks. This is used to decide whether existing + DEFAULT locks should be automatically use elision in pthread_mutex_lock(). + Disabled for suid programs. Only used when elision is available. */ + +int __pthread_force_elision attribute_hidden; + +/* Initialize elision. */ + +static void +elision_init (int argc __attribute__ ((unused)), + char **argv __attribute__ ((unused)), + char **environ) +{ +#ifdef ENABLE_LOCK_ELISION + int elision_available = (GLRO (dl_hwcap2) & PPC_FEATURE2_HAS_HTM) ? 1 : 0; + __pthread_force_elision = __libc_enable_secure ? 0 : elision_available; +#endif + if (!__pthread_force_elision) + /* Disable elision on rwlocks. 
*/ + __elision_aconf.try_tbegin = 0; +} + +#ifdef SHARED +# define INIT_SECTION ".init_array" +# define MAYBE_CONST +#else +# define INIT_SECTION ".preinit_array" +# define MAYBE_CONST const +#endif + +void (*MAYBE_CONST __pthread_init_array []) (int, char **, char **) + __attribute__ ((section (INIT_SECTION), aligned (sizeof (void *)))) = +{ + &elision_init +}; diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-conf.h b/sysdeps/unix/sysv/linux/powerpc/elision-conf.h new file mode 100644 index 0000000000..fc9994de97 --- /dev/null +++ b/sysdeps/unix/sysv/linux/powerpc/elision-conf.h @@ -0,0 +1,42 @@ +/* elision-conf.h: Lock elision tunable parameters. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _ELISION_CONF_H +#define _ELISION_CONF_H 1 + +#include <pthread.h> +#include <time.h> + +/* Should make sure there is no false sharing on this. */ +struct elision_config +{ + int skip_lock_busy; + int skip_lock_internal_abort; + int skip_lock_out_of_tbegin_retries; + int try_tbegin; + int skip_trylock_internal_abort; +} __attribute__ ((__aligned__ (128))); + +extern struct elision_config __elision_aconf attribute_hidden; + +extern int __pthread_force_elision attribute_hidden; + +/* Tell the test suite to test elision for this architecture. */ +#define HAVE_ELISION 1 + +#endif diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-lock.c b/sysdeps/unix/sysv/linux/powerpc/elision-lock.c new file mode 100644 index 0000000000..8c13a29ab2 --- /dev/null +++ b/sysdeps/unix/sysv/linux/powerpc/elision-lock.c @@ -0,0 +1,86 @@ +/* elision-lock.c: Elided pthread mutex lock. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <stdio.h> +#include <pthread.h> +#include <pthreadP.h> +#include <lowlevellock.h> +#include <elision-conf.h> +#include "htm.h" + +#if !defined(LLL_LOCK) && !defined(EXTRAARG) +/* Make sure the configuration code is always linked in for static + libraries. 
*/ +#include "elision-conf.c" +#endif + +#ifndef EXTRAARG +# define EXTRAARG +#endif +#ifndef LLL_LOCK +# define LLL_LOCK(a,b) lll_lock(a,b), 0 +#endif + +#define aconf __elision_aconf + +/* Adaptive lock using transactions. + By default the lock region is run as a transaction, and when it + aborts or the lock is busy the lock adapts itself. */ + +int +__lll_lock_elision (int *lock, short *adapt_count, EXTRAARG int pshared) +{ + if (*adapt_count > 0) + { + (*adapt_count)--; + goto use_lock; + } + + int try_begin = aconf.try_tbegin; + while (1) + { + if (__builtin_tbegin (0)) + { + if (*lock == 0) + return 0; + /* Lock was busy. Fall back to normal locking. */ + __builtin_tabort (_ABORT_LOCK_BUSY); + } + else + { + /* A persistent failure indicates that a retry will probably + result in another failure. Use normal locking now and + for the next couple of calls. */ + if (try_begin-- <= 0 + || _TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ())) + { + if (aconf.skip_lock_internal_abort > 0) + *adapt_count = aconf.skip_lock_internal_abort; + goto use_lock; + } + /* Same logic as above, but for a number of temporary failures + in a row. */ + else if (aconf.skip_lock_out_of_tbegin_retries > 0 + && aconf.try_tbegin > 0) + *adapt_count = aconf.skip_lock_out_of_tbegin_retries; + } + } + +use_lock: + return LLL_LOCK ((*lock), pshared); +}
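For reference, a sketch of how the pthread_mutex_lock() fast path is expected to reach this function once the wrappers later in this patch are in place. The macro names follow the lowlevellock.h additions below; the exact nptl call site is not part of this diff, so the helper name and the private argument here are illustrative assumptions:

    /* Hypothetical call site: elide the mutex's futex word, using the
       __elision field added to pthread_mutex_t above as adapt_count.
       mutex->__data.__lock is the futex word (not shown in this diff).  */
    int
    lock_with_elision (pthread_mutex_t *mutex)
    {
      return lll_lock_elision (mutex->__data.__lock,
                               mutex->__data.__elision,
                               /* private */ 0);
    }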
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk.c b/sysdeps/unix/sysv/linux/powerpc/elision-timed.c index 8b05536ae1..7d5de9b681 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strpbrk.c +++ b/sysdeps/unix/sysv/linux/powerpc/elision-timed.c @@ -1,4 +1,4 @@ -/* Multiple versions of strpbrk. PowerPC64 version. +/* elision-timed.c: Lock elision timed lock. Copyright (C) 2014 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,16 +16,13 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#ifndef NOT_IN_libc -# include <string.h> -# include <shlib-compat.h> -# include "init-arch.h" +#include <time.h> +#include <elision-conf.h> +#include "lowlevellock.h" -extern __typeof (strpbrk) __strpbrk_ppc attribute_hidden; -extern __typeof (strpbrk) __strpbrk_power7 attribute_hidden; +#define __lll_lock_elision __lll_timedlock_elision +#define EXTRAARG const struct timespec *t, +#undef LLL_LOCK +#define LLL_LOCK(a, b) lll_timedlock(a, t, b) -libc_ifunc (strpbrk, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __strpbrk_power7 - : __strpbrk_ppc); -#endif +#include "elision-lock.c" diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c b/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c new file mode 100644 index 0000000000..4a310d5945 --- /dev/null +++ b/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c @@ -0,0 +1,68 @@ +/* elision-trylock.c: Lock eliding trylock for pthreads. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <pthread.h> +#include <pthreadP.h> +#include <lowlevellock.h> +#include <elision-conf.h> +#include "htm.h" + +#define aconf __elision_aconf + +/* Try to elide a futex trylock. FUTEX is the futex variable. ADAPT_COUNT is + the adaptation counter in the mutex. */ + +int +__lll_trylock_elision (int *futex, short *adapt_count) +{ + /* Implement POSIX semantics by forbidding nesting elided trylocks. */ + __builtin_tabort (_ABORT_NESTED_TRYLOCK); + + /* Only try a transaction if it's worth it. */ + if (*adapt_count > 0) + { + (*adapt_count)--; + goto use_lock; + } + + if (__builtin_tbegin (0)) + { + if (*futex == 0) + return 0; + + /* Lock was busy. Fall back to normal locking. */ + __builtin_tabort (_ABORT_LOCK_BUSY); + } + else + { + if (_TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ())) + { + /* A persistent failure indicates that a retry will probably + result in another failure. Use normal locking now and + for the next couple of calls. */ + if (aconf.skip_trylock_internal_abort > 0) + *adapt_count = aconf.skip_trylock_internal_abort; + } + + if (aconf.skip_lock_busy > 0) + *adapt_count = aconf.skip_lock_busy; + } + +use_lock: + return lll_trylock (*futex); +} diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c index bf8c877ec2..59d46bb43a 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/strspn.c +++ b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c @@ -1,4 +1,4 @@ -/* Multiple versions of strspn. PowerPC64 version. +/* elision-unlock.c: Commit an elided pthread lock. Copyright (C) 2014 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,16 +16,17 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#ifndef NOT_IN_libc -# include <string.h> -# include <shlib-compat.h> -# include "init-arch.h" +#include "pthreadP.h" +#include "lowlevellock.h" +#include "htm.h" -extern __typeof (strspn) __strspn_ppc attribute_hidden; -extern __typeof (strspn) __strspn_power7 attribute_hidden; - -libc_ifunc (strspn, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __strspn_power7 - : __strspn_ppc); -#endif +int +__lll_unlock_elision(int *lock, int pshared) +{ + /* When the lock was free we're in a transaction. */ + if (*lock == 0) + __builtin_tend (0); + else + lll_unlock ((*lock), pshared); + return 0; +} diff --git a/sysdeps/unix/sysv/linux/powerpc/force-elision.h b/sysdeps/unix/sysv/linux/powerpc/force-elision.h new file mode 100644 index 0000000000..3da576b944 --- /dev/null +++ b/sysdeps/unix/sysv/linux/powerpc/force-elision.h @@ -0,0 +1,28 @@ +/* force-elision.h: Automatic enabling of elision for mutexes + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details.
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifdef ENABLE_LOCK_ELISION +/* Automatically enable elision for existing user lock kinds. */ +#define FORCE_ELISION(m, s) \ + if (__pthread_force_elision \ + && (m->__data.__kind & PTHREAD_MUTEX_ELISION_FLAGS_NP) == 0) \ + { \ + mutex->__data.__kind |= PTHREAD_MUTEX_ELISION_NP; \ + s; \ + } +#endif diff --git a/sysdeps/unix/sysv/linux/powerpc/htm.h b/sysdeps/unix/sysv/linux/powerpc/htm.h new file mode 100644 index 0000000000..4a570bea6e --- /dev/null +++ b/sysdeps/unix/sysv/linux/powerpc/htm.h @@ -0,0 +1,138 @@ +/* Shared HTM header. Emulate transactional execution facility intrinsics for + compilers and assemblers that do not support the intrinsics and instructions + yet. + + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _HTM_H +#define _HTM_H 1 + +#ifdef __ASSEMBLER__ + +/* tbegin. */ +.macro TBEGIN + .long 0x7c00051d +.endm + +/* tend. 0 */ +.macro TEND + .long 0x7c00055d +.endm + +/* tabort. code */ +.macro TABORT code + .byte 0x7c + .byte \code + .byte 0x07 + .byte 0x1d +.endm + +/*"TEXASR - Transaction EXception And Summary Register" + mfspr %dst,130 */ +.macro TEXASR dst + mfspr \dst,130 +.endm + +#else + +#include <endian.h> + +/* Official HTM intrinsics interface matching GCC, but works + on older GCC compatible compilers and binutils. + We should somehow detect if the compiler supports it, because + it may be able to generate slightly better code. 
*/ + +#define TBEGIN ".long 0x7c00051d" +#define TEND ".long 0x7c00055d" +#if __BYTE_ORDER == __LITTLE_ENDIAN +# define TABORT ".byte 0x1d,0x07,%1,0x7c" +#else +# define TABORT ".byte 0x7c,%1,0x07,0x1d" +#endif + +#define __force_inline inline __attribute__((__always_inline__)) + +#ifndef __HTM__ + +#define _TEXASRU_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \ + (((TEXASR) >> (31-(BITNUM))) & ((1<<(SIZE))-1)) +#define _TEXASRU_FAILURE_PERSISTENT(TEXASRU) \ + _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 1) + +#define _tbegin() \ + ({ unsigned int __ret; \ + asm volatile ( \ + TBEGIN "\t\n" \ + "mfcr %0\t\n" \ + "rlwinm %0,%0,3,1\t\n" \ + "xori %0,%0,1\t\n" \ + : "=r" (__ret) : \ + : "cr0", "memory"); \ + __ret; \ + }) + +#define _tend() \ + ({ unsigned int __ret; \ + asm volatile ( \ + TEND "\t\n" \ + "mfcr %0\t\n" \ + "rlwinm %0,%0,3,1\t\n" \ + "xori %0,%0,1\t\n" \ + : "=r" (__ret) : \ + : "cr0", "memory"); \ + __ret; \ + }) + +#define _tabort(__code) \ + ({ unsigned int __ret; \ + asm volatile ( \ + TABORT "\t\n" \ + "mfcr %0\t\n" \ + "rlwinm %0,%0,3,1\t\n" \ + "xori %0,%0,1\t\n" \ + : "=r" (__ret) : "r" (__code) \ + : "cr0", "memory"); \ + __ret; \ + }) + +#define _texasru() \ + ({ unsigned long __ret; \ + asm volatile ( \ + "mfspr %0,131\t\n" \ + : "=r" (__ret)); \ + __ret; \ + }) + +#define __builtin_tbegin(tdb) _tbegin () +#define __builtin_tend(nested) _tend () +#define __builtin_tabort(abortcode) _tabort (abortcode) +#define __builtin_get_texasru() _texasru () + +#else +# include <htmintrin.h> +#endif /* __HTM__ */ + +#endif /* __ASSEMBLER__ */ + +/* Definitions used for TEXASR Failure code (bits 0:6), they need to be even + because tabort. always sets the first bit. */ +#define _ABORT_LOCK_BUSY 0x3f /* Lock already used. */ +#define _ABORT_NESTED_TRYLOCK 0x3e /* Write operation in trylock. */ +#define _ABORT_SYSCALL 0x3d /* Syscall issued. */ + +#endif diff --git a/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h b/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h index a651d23c50..0e930d00bc 100644 --- a/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h +++ b/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2003-2014 Free Software Foundation, Inc. +/* Copyright (C) 2003-2015 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Paul Mackerras <paulus@au.ibm.com>, 2003. @@ -339,4 +339,28 @@ extern int __lll_timedwait_tid (int *, const struct timespec *) __res; \ }) +/* Transactional lock elision definitions. 
*/ +extern int __lll_timedlock_elision + (int *futex, short *adapt_count, const struct timespec *timeout, int private) + attribute_hidden; + +#define lll_timedlock_elision(futex, adapt_count, timeout, private) \ + __lll_timedlock_elision(&(futex), &(adapt_count), timeout, private) + +extern int __lll_lock_elision (int *futex, short *adapt_count, int private) + attribute_hidden; + +extern int __lll_unlock_elision(int *lock, int private) + attribute_hidden; + +extern int __lll_trylock_elision(int *lock, short *adapt_count) + attribute_hidden; + +#define lll_lock_elision(futex, adapt_count, private) \ + __lll_lock_elision (&(futex), &(adapt_count), private) +#define lll_unlock_elision(futex, private) \ + __lll_unlock_elision (&(futex), private) +#define lll_trylock_elision(futex, adapt_count) \ + __lll_trylock_elision (&(futex), &(adapt_count)) + #endif /* lowlevellock.h */ diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h b/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h index 1a5e37a1d9..0947ca34a6 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h @@ -194,6 +194,7 @@ register long int r11 __asm__ ("r11"); \ register long int r12 __asm__ ("r12"); \ LOADARGS_##nr(name, args); \ + ABORT_TRANSACTION; \ __asm__ __volatile__ \ ("sc \n\t" \ "mfcr %0" \ diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h b/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h index 93e454e902..a3cc3025e0 100644 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h @@ -201,6 +201,7 @@ register long int r7 __asm__ ("r7"); \ register long int r8 __asm__ ("r8"); \ LOADARGS_##nr (name, ##args); \ + ABORT_TRANSACTION; \ __asm__ __volatile__ \ ("sc\n\t" \ "mfcr %0\n\t" \ diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_cond_lock.c index 72b75acff7..aa6cf9a79e 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S +++ b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_cond_lock.c @@ -1,5 +1,4 @@ -/* Optimized bzero implementation for PowerPC64/POWER4. - Copyright (C) 2013-2014 Free Software Foundation, Inc. +/* Copyright (C) 2014 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,11 +15,8 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <sysdep.h> +/* The cond lock is not actually elided yet, but we still need to handle + already elided locks. */ +#include <elision-conf.h> -ENTRY (__bzero_power4) - CALL_MCOUNT 3 - mr r5,r4 - li r4,0 - b __memset_power4 -END_GEN_TB (__bzero_power4,TB_TOCLESS) +#include <nptl/pthread_mutex_cond_lock.c> diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_lock.c index d0917c5e66..6fd6a9866f 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S +++ b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_lock.c @@ -1,5 +1,5 @@ -/* Optimized bzero implementation for PowerPC64/POWER6. - Copyright (C) 2013-2014 Free Software Foundation, Inc. +/* Elided version of pthread_mutex_lock. + Copyright (C) 2014 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,11 +16,7 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. 
*/ -#include <sysdep.h> +#include <elision-conf.h> +#include <force-elision.h> -ENTRY (__bzero_power6) - CALL_MCOUNT 3 - mr r5,r4 - li r4,0 - b __memset_power6 -END_GEN_TB (__bzero_power6,TB_TOCLESS) +#include <nptl/pthread_mutex_lock.c> diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_timedlock.c index 0ec285a9bd..d0e6537ecc 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S +++ b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_timedlock.c @@ -1,5 +1,5 @@ -/* Optimized bzero implementation for PowerPC64/POWER7. - Copyright (C) 2013-2014 Free Software Foundation, Inc. +/* Elided version of pthread_mutex_timedlock. + Copyright (C) 2014 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -16,11 +16,7 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <sysdep.h> +#include <elision-conf.h> +#include <force-elision.h> -ENTRY (__bzero_power7) - CALL_MCOUNT 3 - mr r5,r4 - li r4,0 - b __memset_power7 -END_GEN_TB (__bzero_power7,TB_TOCLESS) +#include <nptl/pthread_mutex_timedlock.c> diff --git a/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_trylock.c b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_trylock.c new file mode 100644 index 0000000000..ea8a8fff93 --- /dev/null +++ b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_trylock.c @@ -0,0 +1,22 @@ +/* Elided version of pthread_mutex_trylock. + Copyright (C) 2014 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <elision-conf.h> +#include <force-elision.h> + +#include <nptl/pthread_mutex_trylock.c> diff --git a/sysdeps/unix/sysv/linux/powerpc/syscall.S b/sysdeps/unix/sysv/linux/powerpc/syscall.S index 346e962240..d78eee76f1 100644 --- a/sysdeps/unix/sysv/linux/powerpc/syscall.S +++ b/sysdeps/unix/sysv/linux/powerpc/syscall.S @@ -18,6 +18,7 @@ #include <sysdep.h> ENTRY (syscall) + ABORT_TRANSACTION mr r0,r3 mr r3,r4 mr r4,r5 |