-rw-r--r--  ChangeLog | 312
-rw-r--r--  NEWS | 11
-rw-r--r--  benchtests/bench-memset.c | 5
-rw-r--r--  benchtests/bench-strcpy.c | 16
-rw-r--r--  csu/tst-atomic.c | 16
-rw-r--r--  elf/get-dynamic-info.h | 4
-rw-r--r--  localedata/locales/bo_CN | 3
-rw-r--r--  localedata/locales/bo_IN | 2
-rw-r--r--  sysdeps/ieee754/dbl-64/Makefile | 1
-rw-r--r--  sysdeps/powerpc/bits/atomic.h | 51
-rw-r--r--  sysdeps/powerpc/nptl/elide.h | 111
-rw-r--r--  sysdeps/powerpc/nptl/tcb-offsets.sym | 1
-rw-r--r--  sysdeps/powerpc/nptl/tls.h | 19
-rw-r--r--  sysdeps/powerpc/powerpc32/bits/atomic.h | 8
-rw-r--r--  sysdeps/powerpc/powerpc32/sysdep.h | 16
-rw-r--r--  sysdeps/powerpc/powerpc64/bits/atomic.h | 31
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/Makefile | 20
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/bzero.c | 11
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c | 48
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/memset-power4.S | 4
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/memset-power6.S | 4
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/memset-power7.S | 3
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/memset-power8.S | 43
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/memset.c | 11
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S (renamed from sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S) | 16
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S | 39
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 7
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c (renamed from sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c) | 20
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/strcat.c | 7
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S (renamed from sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S) | 16
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/strcmp.c | 7
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S (renamed from sysdeps/powerpc/powerpc64/multiarch/strcspn-power7.S) | 16
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/strcpy.c | 7
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c (renamed from sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c) | 21
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S | 40
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/strncmp.c | 9
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S (renamed from sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S) | 18
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 7
-rw-r--r--  sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c | 33
-rw-r--r--  sysdeps/powerpc/powerpc64/power4/memset.S | 5
-rw-r--r--  sysdeps/powerpc/powerpc64/power6/memset.S | 5
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memcmp.S | 870
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/memset.S | 5
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strcmp.S | 197
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strcpy.S | 327
-rw-r--r--  sysdeps/powerpc/powerpc64/power7/strncat.S | 228
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S | 7
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S | 6
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/memset.S | 451
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/stpcpy.S | 24
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/stpncpy.S | 20
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcmp.S | 257
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strcpy.S | 262
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strncmp.S | 323
-rw-r--r--  sysdeps/powerpc/powerpc64/power8/strncpy.S | 424
-rw-r--r--  sysdeps/powerpc/powerpc64/strcspn.S (renamed from sysdeps/powerpc/powerpc64/power7/strcspn.S) | 58
-rw-r--r--  sysdeps/powerpc/powerpc64/strpbrk.S (renamed from sysdeps/powerpc/powerpc64/power7/strpbrk.S) | 61
-rw-r--r--  sysdeps/powerpc/powerpc64/strspn.S (renamed from sysdeps/powerpc/powerpc64/power7/strspn.S) | 73
-rw-r--r--  sysdeps/powerpc/powerpc64/strtok.S | 226
-rw-r--r--  sysdeps/powerpc/powerpc64/strtok_r.S (renamed from sysdeps/powerpc/powerpc64/multiarch/strcspn.c) | 19
-rw-r--r--  sysdeps/powerpc/powerpc64/sysdep.h | 16
-rw-r--r--  sysdeps/powerpc/sysdep.h | 22
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/Makefile | 2
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/bits/pthreadtypes.h | 24
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/elision-conf.c | 83
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/elision-conf.h | 42
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/elision-lock.c | 107
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/elision-timed.c (renamed from sysdeps/powerpc/powerpc64/multiarch/strpbrk.c) | 21
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/elision-trylock.c | 68
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/elision-unlock.c (renamed from sysdeps/powerpc/powerpc64/multiarch/strspn.c) | 27
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/force-elision.h | 28
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/htm.h | 138
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/lowlevellock.h | 26
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h | 1
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h | 1
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/pthread_mutex_cond_lock.c (renamed from sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S) | 14
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/pthread_mutex_lock.c (renamed from sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S) | 14
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/pthread_mutex_timedlock.c (renamed from sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S) | 14
-rw-r--r--  sysdeps/unix/sysv/linux/powerpc/pthread_mutex_trylock.c | 22
82 files changed, 4157 insertions, 1396 deletions
diff --git a/ChangeLog b/ChangeLog
index ccce486c9d..b19db3c9db 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,315 @@
+2015-03-10  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/ieee754/dbl-64/Makefile (CFLAGS-e_pow.c): Add
+	$(config-cflags-nofma).
+
+2015-02-23  Paul Pluzhnikov  <ppluzhnikov@google.com>
+
+	[BZ #16618]
+	* stdio-common/tst-sscanf.c (main): Test for buffer overflow.
+	* stdio-common/vfscanf.c (_IO_vfscanf_internal): Compute needed
+	size in bytes. Store needed elements in wpmax. Use needed size
+	in bytes for extend_alloca.
+
+2015-02-12  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/unix/sysv/linux/powerpc/htm.h [TABORT]: Fix encoding for
+	little endian.
+
+2015-01-20  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/nptl/tls.h (tcbhead_t): Add tm_capable field.
+	(TLS_INIT_TP): Add tm_capable initialization.
+	(TLS_DEFINE_INIT_TP): Likewise.
+	(THREAD_GET_TM_CAPABLE): New macro: get tm_capable field value from
+	TCB.
+	(THREAD_SET_TM_CAPABLE): New macro: set tm_capable field value in TCB.
+	* sysdeps/powerpc/nptl/tcb-offsets.sym (TM_CAPABLE): Add field offset
+	calculation.
+	* sysdeps/powerpc/powerpc32/sysdep.h (DO_CALL): Abort hardware
+	transaction if lock elision is built and TCB tm_capable is set.
+	* sysdeps/powerpc/powerpc64/sysdep.h (DO_CALL): Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h
+	(INTERNAL_SYSCALL_NCS): Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h
+	(INTERNAL_SYSCALL_NCS): Likewise.
+	* sysdeps/powerpc/sysdep.h (ABORT_TRANSACTION): New define.
+
+	* sysdeps/powerpc/nptl/elide.h: New file: generic lock elision support
+	for powerpc.
+	* sysdeps/unix/sysv/linux/powerpc/bits/pthreadtypes.h
+	[pthread_rwlock_t] (__pad1): Change size to 7 bytes in the 64-bit case
+	and remove it in the 32-bit case.
+	[pthread_rwlock_t] (__rwelision): New field for lock elision.
+	(__PTHREAD_RWLOCK_ELISION_EXTRA): Adjust for new lock elision field
+	initialization.
+	* sysdeps/unix/sysv/linux/powerpc/elision-conf.c (elision_init):
+	Disable lock elision with rdlocks if elision is not available.
+
+	* sysdeps/unix/sysv/linux/powerpc/Makefile [nptl]
+	(sysdep_routines): Add lock elision objects.
+	* sysdeps/unix/sysv/linux/powerpc/bits/pthreadtypes.h
+	[pthread_mutex_t] (__spins): Rework to add lock elision field.
+	[pthread_mutex_t] (__elision): Add field.
+	[__PTHREAD_SPINS]: Adjust to init lock elision field.
+	* sysdeps/unix/sysv/linux/powerpc/elision-conf.c: New file: lock
+	elision definitions for powerpc.
+	* sysdeps/unix/sysv/linux/powerpc/elision-lock.c: New file:
+	implementation of lock elision for powerpc.
+	* sysdeps/unix/sysv/linux/powerpc/elision-timed.c: New file:
+	implementation of timed lock elision for powerpc.
+	* sysdeps/unix/sysv/linux/powerpc/elision-trylock.c: New file:
+	implementation of trylock with lock elision for powerpc.
+	* sysdeps/unix/sysv/linux/powerpc/elision-unlock.c: New file:
+	implementation of unlock for lock elision for powerpc.
+	* sysdeps/unix/sysv/linux/powerpc/force-elision.h: New file:
+	automatically enable lock elision for mutexes.
+	* sysdeps/unix/sysv/linux/powerpc/htm.h: New file: hardware
+	transaction execution definitions for powerpc.
+	* sysdeps/unix/sysv/linux/powerpc/lowlevellock.h: New file: add TLE
+	definitions.
+	* sysdeps/unix/sysv/linux/powerpc/pthread_mutex_cond_lock.c: New file.
+	* sysdeps/unix/sysv/linux/powerpc/pthread_mutex_lock.c: Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/pthread_mutex_timedlock.c: Likewise.
+	* sysdeps/unix/sysv/linux/powerpc/pthread_mutex_trylock.c: Likewise.
+	* NEWS: Update.
+
+2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power7/memcmp.S (memcmp): Fix performance
+	regression on LE.
+
+	* sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S: New file.
+	* sysdeps/powerpc/powerpc64/power8/strncmp.S: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+	strncmp-power8 object.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strncmp_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strncmp.c (strncmp): Likewise.
+	* NEWS: Update.
+
+2015-01-13  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
+	    Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power7/strcmp.S (strcmp): Optimize
+	trailing byte check.
+
+2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Add strcmp-power8 object.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strcmp_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/strcmp.c (strcmp): Add
+	__strcmp_power8 implementation.
+	* sysdeps/powerpc/powerpc64/power8/strcmp.S: New file.
+	* NEWS: Update.
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Add strncpy-power8 and stpncpy-power8 objects.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strncpy_power8 and __stpncpy_power8
+	implementations.
+	* sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/stpncpy.c (__stpncpy): Add
+	__stpncpy_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/strncpy.c (strncpy): Add
+	__strncpy_power8 implementation.
+	* sysdeps/powerpc/powerpc64/power8/stpncpy.S: New file.
+	* sysdeps/powerpc/powerpc64/power8/strncpy.S: New file.
+	* NEWS: Update.
+
+	* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: Remove file.
+	* sysdeps/powerpc/powerpc64/power7/strncat.S: Likewise.
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+	strcat-power8 object.
+	* sysdeps/powerpc/powerpc64/multiarch/strcat.c (strcat): Add
+	__strcat_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strcat_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c: New file:
+	optimized strcat for power8.
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+	strcpy-power8 and stpcpy-power8 objects.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strcpy_power8 and __stpcpy_power8
+	implementations.
+	* sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S: New file:
+	multiarch stpcpy implementation for POWER8.
+	* sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S: New file:
+	multiarch strcpy implementation for POWER8.
+	* sysdeps/powerpc/powerpc64/multiarch/strcpy.c (strcpy): Add
+	__strcpy_power8 function.
+	* sysdeps/powerpc/powerpc64/power8/stpcpy.S: New file: optimized
+	stpcpy for POWER8.
+	* sysdeps/powerpc/powerpc64/power8/strcpy.S: New file: optimized
+	strcpy for POWER8.
+	* NEWS: Update.
+
+2014-12-31  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
+	    Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power7/strcpy.S (strcpy): Optimize unaligned
+	path.
+	* benchtests/bench-strcpy.c (test_main): Add more unaligned inputs.
+
+2014-12-16  Florian Weimer  <fweimer@redhat.com>
+
+	[BZ #17630]
+	* resolv/nss_dns/dns-network.c (getanswer_r): Iterate over alias
+	names.
+
+2014-12-15  Jeff Law  <law@redhat.com>
+
+	[BZ #16617]
+	* stdio-common/vfprintf.c (vfprintf): Allocate large specs array
+	on the heap.  (CVE-2012-3406)
+	* stdio-common/bug23-2.c, stdio-common/bug23-3.c: New file.
+	* stdio-common/bug23-4.c: New file.  Test case by Joseph Myers.
+	* stdio-common/Makefile (tests): Add bug23-2, bug23-3, bug23-4.
+
+2014-12-02  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Remove strpbrk objects.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Remove strpbrk implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c: Remove file.
+	* sysdeps/powerpc/powerpc64/multiarch/strpbrk.c: Remove file.
+	* sysdeps/powerpc/powerpc64/power7/strpbrk.S: Remove file.
+	* sysdeps/powerpc/powerpc64/strpbrk.S: New file.
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Remove strcspn objects.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Remove strcspn implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c: Remove file.
+	* sysdeps/powerpc/powerpc64/multiarch/strcspn.c: Remove file.
+	* sysdeps/powerpc/powerpc64/power7/strcspn.S: Remove file.
+	* sysdeps/powerpc/powerpc64/strcspn.S: New file.
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Remove strspn objects.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Remove strspn implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S: Remove file.
+	* sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/strspn.S: Remove file.
+	* sysdeps/powerpc/powerpc64/strspn.S: New file.
+
+2014-12-01  Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/strtok.S: New file.
+	* sysdeps/powerpc/powerpc64/strtok_r.S: New file.
+
+2014-11-26  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* csu/tst-atomic.c (do_test): Add atomic_exchange_and_add_{acq,rel}
+	tests.
+	* sysdeps/powerpc/bits/atomic.h
+	(__arch_atomic_exchange_and_add_32_acq): Add definition.
+	(__arch_atomic_exchange_and_add_32_rel): Likewise.
+	(atomic_exchange_and_add_acq): Likewise.
+	(atomic_exchange_and_add_rel): Likewise.
+	* sysdeps/powerpc/powerpc32/bits/atomic.h
+	(__arch_atomic_exchange_and_add_64_acq): Add definition.
+	(__arch_atomic_exchange_and_add_64_rel): Likewise.
+	* sysdeps/powerpc/powerpc64/bits/atomic.h
+	(__arch_atomic_exchange_and_add_64_acq): Add definition.
+	(__arch_atomic_exchange_and_add_64_rel): Likewise.
+
+2014-11-25  Anton Blanchard <anton@samba.org>
+
+	* sysdeps/powerpc/bits/atomic.h
+	(__arch_compare_and_exchange_bool_64_rel): Load from mem.
+
+2014-11-19  Carlos O'Donell  <carlos@redhat.com>
+	    Florian Weimer  <fweimer@redhat.com>
+	    Joseph Myers  <joseph@codesourcery.com>
+	    Adam Conrad  <adconrad@0c3.net>
+	    Andreas Schwab  <schwab@suse.de>
+	    Brooks Moses  <bmoses@google.com>
+
+	[BZ #17625]
+	* posix/wordexp-test.c (__dso_handle): Add prototype.
+	(__register_atfork): Likewise.
+	(__app_register_atfork): New function.
+	(registered_forks): New global.
+	(register_fork): New function.
+	(test_case): Add 3 new tests for WRDE_CMDSUB.
+	(main): Call __app_register_atfork.
+	(testit): If WRDE_NOCMD is set, zero registered_forks, run the test,
+	and fail the test if the fork count is non-zero.
+	* posix/wordexp.c (exec_comm): Return WRDE_CMDSUB if WRDE_NOCMD flag
+	is set.
+	(parse_dollars): Remove check for WRDE_NOCMD.
+	(parse_dquote): Likewise.
+
+2014-11-05  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power8/memset.S (MTVSRD_V1_R4): Simplify
+	definition.
+	* sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S (MFVSRD_R3_V1):
+	Likewise.
+	* sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S (MFVSRD_R3_V1):
+	Likewise.
+	* sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S (MFVSRD_R3_V1):
+	Likewise.
+	* sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S (MFVSRD_R3_V1):
+	Likewise.
+	* sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S (MFVSRD_R3_V1):
+	Likewise.
+
+2014-11-03  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power8/memset.S (MTVSRD_V1_R4): Encode
+	mtvsrd instruction in binary form.
+
+2014-10-31  Torvald Riegel  <triegel@redhat.com>
+
+	* sysdeps/powerpc/bits/atomic.h (atomic_write_barrier): Remove and...
+	* sysdeps/powerpc/powerpc32/bits/atomic.h (atomic_write_barrier):
+	... add here and use lwsync or sync ...
+	* sysdeps/powerpc/powerpc64/bits/atomic.h (atomic_write_barrier):
+	... and add here using lwsync.
+
+2014-09-10  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* benchtests/bench-memset.c (test_main): Add more tests for sizes
+	from 32 to 512 bytes.
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Add POWER8 memset object.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add POWER8 memset and bzero implementations.
+	* sysdeps/powerpc/powerpc64/multiarch/bzero.c (__bzero): Add POWER8
+	implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/memset.c (__libc_memset):
+	Likewise.
+	* sysdeps/powerpc/powerpc64/multiarch/memset-power8.S: New file:
+	multiarch POWER8 memset optimization.
+	* sysdeps/powerpc/powerpc64/power8/memset.S: New file: optimized
+	POWER8 memset optimization.
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Remove bzero multiarch objects.
+	* sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S: Remove file.
+	* sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S: Likewise.
+	* sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S: Likewise.
+	* sysdeps/powerpc/powerpc64/multiarch/memset-power4.S [NO_BZERO_IMPL]:
+	Remove define.
+	[__bzero]: Redefine to specific name.
+	* sysdeps/powerpc/powerpc64/multiarch/memset-power6.S: Likewise.
+	* sysdeps/powerpc/powerpc64/multiarch/memset-power7.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power4/memset.S [NO_BZERO_IMPL]: Remove
+	define.
+	* sysdeps/powerpc/powerpc64/power6/memset.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/memset.S: Likewise.
+
 2015-02-16  Paul Pluzhnikov  <ppluzhnikov@google.com>
 
 	[BZ #16618]
diff --git a/NEWS b/NEWS
index 976f2ea023..4d94c08377 100644
--- a/NEWS
+++ b/NEWS
@@ -12,6 +12,17 @@ Version 2.20.1
   16009, 16617, 16618, 17266, 17370, 17371, 17460, 17485, 17555, 17625,
   17630, 17801.
 
+* Added support for hardware lock elision (HTM) of pthread mutexes on
+  powerpc32, powerpc64 and powerpc64le.  This may improve lock scaling of
+  existing programs on HTM capable systems.  The lock elision code is only
+  enabled with --enable-lock-elision=yes.  Also, the powerpc lock elision
+  implementation will issue a transaction abort on every syscall to avoid
+  side effects being visible outside transactions.
+
+* Optimized strcpy, stpcpy, strncpy, stpncpy, strcmp, and strncmp
+  implementations for powerpc64/powerpc64le.
+  Implemented by Adhemerval Zanella (IBM).
+
 * CVE-2015-1472 Under certain conditions wscanf can allocate too little
   memory for the to-be-scanned arguments and overflow the allocated
   buffer.  The implementation now correctly computes the required buffer
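
Lock elision is transparent to applications; no source changes are needed.  As
a minimal sketch (assuming glibc configured with --enable-lock-elision=yes and
running on an HTM-capable POWER8 system), an ordinary POSIX critical section
like the following is exactly what the feature targets:

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static long counter;

    void *
    worker (void *arg)
    {
      /* With elision the lock word may never be written: the critical
         section runs as a hardware transaction and falls back to the
         real lock only if the transaction aborts.  */
      pthread_mutex_lock (&lock);
      counter++;
      pthread_mutex_unlock (&lock);
      return NULL;
    }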
diff --git a/benchtests/bench-memset.c b/benchtests/bench-memset.c
index 5304113e3d..20265936b9 100644
--- a/benchtests/bench-memset.c
+++ b/benchtests/bench-memset.c
@@ -150,6 +150,11 @@ test_main (void)
 	  if (i & (i - 1))
 	    do_test (0, c, i);
 	}
+      for (i = 32; i < 512; i+=32)
+	{
+	  do_test (0, c, i);
+	  do_test (i, c, i);
+	}
       do_test (1, c, 14);
       do_test (3, c, 1024);
       do_test (4, c, 64);
diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
index c3ab4cfcf7..e9445f290f 100644
--- a/benchtests/bench-strcpy.c
+++ b/benchtests/bench-strcpy.c
@@ -171,6 +171,22 @@ test_main (void)
       do_test (i, i, 8 << i, BIG_CHAR);
     }
 
+  for (i = 16; i <= 512; i+=4)
+    {
+      do_test (0, 4, i, SMALL_CHAR);
+      do_test (4, 0, i, BIG_CHAR);
+      do_test (4, 4, i, SMALL_CHAR);
+      do_test (2, 2, i, BIG_CHAR);
+      do_test (2, 6, i, SMALL_CHAR);
+      do_test (6, 2, i, BIG_CHAR);
+      do_test (1, 7, i, SMALL_CHAR);
+      do_test (7, 1, i, BIG_CHAR);
+      do_test (3, 4, i, SMALL_CHAR);
+      do_test (4, 3, i, BIG_CHAR);
+      do_test (5, 7, i, SMALL_CHAR);
+      do_test (7, 5, i, SMALL_CHAR);
+    }
+
   return ret;
 }
 
diff --git a/csu/tst-atomic.c b/csu/tst-atomic.c
index d16c66dc31..ab6db45307 100644
--- a/csu/tst-atomic.c
+++ b/csu/tst-atomic.c
@@ -113,6 +113,22 @@ do_test (void)
       ret = 1;
     }
 
+  mem = 2;
+  if (atomic_exchange_and_add_acq (&mem, 11) != 2
+      || mem != 13)
+    {
+      puts ("atomic_exchange_and_add test failed");
+      ret = 1;
+    }
+
+  mem = 2;
+  if (atomic_exchange_and_add_rel (&mem, 11) != 2
+      || mem != 13)
+    {
+      puts ("atomic_exchange_and_add test failed");
+      ret = 1;
+    }
+
   mem = -21;
   atomic_add (&mem, 22);
   if (mem != 1)
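
The new _acq/_rel variants mirror C11 atomic_fetch_add_explicit with
memory_order_acquire and memory_order_release.  A standalone sketch of the
intended semantics (illustrative only; the glibc-internal macros are not
implemented this way):

    #include <stdatomic.h>

    static _Atomic int mem;

    /* _acq: later memory accesses cannot be reordered before the update.  */
    int
    fetch_add_acq (int value)
    {
      return atomic_fetch_add_explicit (&mem, value, memory_order_acquire);
    }

    /* _rel: earlier memory accesses cannot be reordered after the update.  */
    int
    fetch_add_rel (int value)
    {
      return atomic_fetch_add_explicit (&mem, value, memory_order_release);
    }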
diff --git a/elf/get-dynamic-info.h b/elf/get-dynamic-info.h
index 20ccf30b2a..7f51d90dbc 100644
--- a/elf/get-dynamic-info.h
+++ b/elf/get-dynamic-info.h
@@ -130,8 +130,8 @@ elf_get_dynamic_info (struct link_map *l, ElfW(Dyn) *temp)
   assert (info[DT_FLAGS] == NULL
 	  || (info[DT_FLAGS]->d_un.d_val & ~DF_BIND_NOW) == 0);
   /* Flags must not be set for ld.so.  */
-  assert (info[DT_RUNPATH] == NULL);
-  assert (info[DT_RPATH] == NULL);
+  info[DT_RUNPATH] = NULL;
+  info[DT_RPATH] = NULL;
 #else
   if (info[DT_FLAGS] != NULL)
     {
diff --git a/localedata/locales/bo_CN b/localedata/locales/bo_CN
index d813c103ae..c573d3fe42 100644
--- a/localedata/locales/bo_CN
+++ b/localedata/locales/bo_CN
@@ -145,8 +145,7 @@ END LC_MEASUREMENT
 
 LC_NAME
 % FIXME
-
-name_fmt  ""
+name_fmt	"FIXME"
 % name_gen	"FIXME"
 % name_miss	"FIXME"
 % name_mr	"FIXME"
diff --git a/localedata/locales/bo_IN b/localedata/locales/bo_IN
index 8ab793c833..a1a62808fb 100644
--- a/localedata/locales/bo_IN
+++ b/localedata/locales/bo_IN
@@ -71,7 +71,7 @@ END LC_MEASUREMENT
 
 LC_NAME
 % FIXME
-name_fmt	""
+name_fmt	"FIXME"
 % name_gen	"FIXME"
 % name_miss	"FIXME"
 % name_mr	"FIXME"
diff --git a/sysdeps/ieee754/dbl-64/Makefile b/sysdeps/ieee754/dbl-64/Makefile
index 35f545ff8e..5557c75b45 100644
--- a/sysdeps/ieee754/dbl-64/Makefile
+++ b/sysdeps/ieee754/dbl-64/Makefile
@@ -2,4 +2,5 @@ ifeq ($(subdir),math)
 # branred depends on precise IEEE double rounding
 CFLAGS-branred.c = $(config-cflags-nofma)
 CFLAGS-e_sqrt.c = $(config-cflags-nofma)
+CFLAGS-e_pow.c = $(config-cflags-nofma)
 endif
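
branred, e_sqrt and now e_pow rely on exact IEEE double rounding, so FMA
contraction has to be suppressed for them; $(config-cflags-nofma) is
configure-detected and typically expands to -ffp-contract=off (an assumption
about the usual toolchain, not something this patch shows).  The hazard in
miniature:

    /* With contraction enabled the compiler may fuse this into a single
       fma (a, b, c) with only one rounding step, changing results in code
       that depends on the intermediate a*b being rounded to double.  */
    double
    mul_add (double a, double b, double c)
    {
      return a * b + c;
    }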
diff --git a/sysdeps/powerpc/bits/atomic.h b/sysdeps/powerpc/bits/atomic.h
index 2ffba48d55..b05b0f7aa0 100644
--- a/sysdeps/powerpc/bits/atomic.h
+++ b/sysdeps/powerpc/bits/atomic.h
@@ -77,7 +77,6 @@ typedef uintmax_t uatomic_max_t;
 #endif
 
 #define atomic_full_barrier()	__asm ("sync" ::: "memory")
-#define atomic_write_barrier()	__asm ("eieio" ::: "memory")
 
 #define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval)	      \
   ({									      \
@@ -153,6 +152,34 @@ typedef uintmax_t uatomic_max_t;
     __val;								      \
   })
 
+#define __arch_atomic_exchange_and_add_32_acq(mem, value) \
+  ({									      \
+    __typeof (*mem) __val, __tmp;					      \
+    __asm __volatile ("1:	lwarx	%0,0,%3" MUTEX_HINT_ACQ "\n"	      \
+		      "		add	%1,%0,%4\n"			      \
+		      "		stwcx.	%1,0,%3\n"			      \
+		      "		bne-	1b\n"				      \
+		      __ARCH_ACQ_INSTR					      \
+		      : "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+		      : "b" (mem), "r" (value), "m" (*mem)		      \
+		      : "cr0", "memory");				      \
+    __val;								      \
+  })
+
+#define __arch_atomic_exchange_and_add_32_rel(mem, value) \
+  ({									      \
+    __typeof (*mem) __val, __tmp;					      \
+    __asm __volatile (__ARCH_REL_INSTR "\n"				      \
+		      "1:	lwarx	%0,0,%3" MUTEX_HINT_REL "\n"	      \
+		      "		add	%1,%0,%4\n"			      \
+		      "		stwcx.	%1,0,%3\n"			      \
+		      "		bne-	1b"				      \
+		      : "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+		      : "b" (mem), "r" (value), "m" (*mem)		      \
+		      : "cr0", "memory");				      \
+    __val;								      \
+  })
+
 #define __arch_atomic_increment_val_32(mem) \
   ({									      \
     __typeof (*(mem)) __val;						      \
@@ -253,6 +280,28 @@ typedef uintmax_t uatomic_max_t;
        abort ();							      \
     __result;								      \
   })
+#define atomic_exchange_and_add_acq(mem, value) \
+  ({									      \
+    __typeof (*(mem)) __result;						      \
+    if (sizeof (*mem) == 4)						      \
+      __result = __arch_atomic_exchange_and_add_32_acq (mem, value);	      \
+    else if (sizeof (*mem) == 8)					      \
+      __result = __arch_atomic_exchange_and_add_64_acq (mem, value);	      \
+    else 								      \
+       abort ();							      \
+    __result;								      \
+  })
+#define atomic_exchange_and_add_rel(mem, value) \
+  ({									      \
+    __typeof (*(mem)) __result;						      \
+    if (sizeof (*mem) == 4)						      \
+      __result = __arch_atomic_exchange_and_add_32_rel (mem, value);	      \
+    else if (sizeof (*mem) == 8)					      \
+      __result = __arch_atomic_exchange_and_add_64_rel (mem, value);	      \
+    else 								      \
+       abort ();							      \
+    __result;								      \
+  })
 
 #define atomic_increment_val(mem) \
   ({									      \
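
Each of the new macros is the standard PowerPC load-reserve/store-conditional
loop: lwarx reserves the word, add computes the new value, stwcx. stores only
if the reservation still holds, and bne- retries; __ARCH_ACQ_INSTR (isync) or
__ARCH_REL_INSTR (lwsync/sync) supplies the ordering.  A C-level sketch of the
same retry loop using GCC builtins (illustrative, not the glibc code):

    static inline int
    exchange_and_add_32_acq (int *mem, int value)
    {
      int old = __atomic_load_n (mem, __ATOMIC_RELAXED);
      /* Retry while another thread invalidates the reservation; on
         failure the builtin reloads OLD with the current value.  */
      while (!__atomic_compare_exchange_n (mem, &old, old + value, 0,
                                           __ATOMIC_ACQUIRE,
                                           __ATOMIC_RELAXED))
        ;
      return old;
    }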
diff --git a/sysdeps/powerpc/nptl/elide.h b/sysdeps/powerpc/nptl/elide.h
new file mode 100644
index 0000000000..01572d99ce
--- /dev/null
+++ b/sysdeps/powerpc/nptl/elide.h
@@ -0,0 +1,111 @@
+/* elide.h: Generic lock elision support for powerpc.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef ELIDE_PPC_H
+# define ELIDE_PPC_H
+
+#ifdef ENABLE_LOCK_ELISION
+# include <htm.h>
+# include <elision-conf.h>
+
+/* Returns true if the lock defined by IS_LOCK_FREE was elided.
+   ADAPT_COUNT is a pointer to a per-lock state variable.  */
+
+static inline bool
+__elide_lock (uint8_t *adapt_count, int is_lock_free)
+{
+  if (*adapt_count > 0)
+    {
+      (*adapt_count)--;
+      return false;
+    }
+
+  for (int i = __elision_aconf.try_tbegin; i > 0; i--)
+    {
+      if (__builtin_tbegin (0))
+	{
+	  if (is_lock_free)
+	    return true;
+	  /* Lock was busy.  */
+	  __builtin_tabort (_ABORT_LOCK_BUSY);
+	}
+      else
+	{
+	  /* A persistent failure indicates that a retry will probably
+	     result in another failure.  Use normal locking now and
+	     for the next couple of calls.  */
+	  if (_TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ()))
+	    {
+	      if (__elision_aconf.skip_lock_internal_abort > 0)
+		*adapt_count = __elision_aconf.skip_lock_internal_abort;
+	      break;
+	    }
+	  /* Same logic as above, but for a number of temporary failures in
+	     a row.  */
+	  else if (__elision_aconf.skip_lock_out_of_tbegin_retries > 0
+		   && __elision_aconf.try_tbegin > 0)
+	    *adapt_count = __elision_aconf.skip_lock_out_of_tbegin_retries;
+	}
+    }
+
+  return false;
+}
+
+# define ELIDE_LOCK(adapt_count, is_lock_free) \
+  __elide_lock (&(adapt_count), is_lock_free)
+
+
+static inline bool
+__elide_trylock (uint8_t *adapt_count, int is_lock_free, int write)
+{
+  if (__elision_aconf.try_tbegin > 0)
+    {
+      if (write)
+	__builtin_tabort (_ABORT_NESTED_TRYLOCK);
+      return __elide_lock (adapt_count, is_lock_free);
+    }
+  return false;
+}
+
+# define ELIDE_TRYLOCK(adapt_count, is_lock_free, write)	\
+  __elide_trylock (&(adapt_count), is_lock_free, write)
+
+
+static inline bool
+__elide_unlock (int is_lock_free)
+{
+  if (is_lock_free)
+    {
+      __builtin_tend (0);
+      return true;
+    }
+  return false;
+}
+
+# define ELIDE_UNLOCK(is_lock_free) \
+  __elide_unlock (is_lock_free)
+
+# else
+
+# define ELIDE_LOCK(adapt_count, is_lock_free) 0
+# define ELIDE_TRYLOCK(adapt_count, is_lock_free, write) 0
+# define ELIDE_UNLOCK(is_lock_free) 0
+
+#endif /* ENABLE_LOCK_ELISION  */
+
+#endif
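
These macros are consumed by the generic NPTL lock code.  A hedged sketch of
how a reader-lock path would wire them up (all names below are illustrative,
not the actual nptl sources):

    #include <stdint.h>

    struct my_rwlock
    {
      uint8_t elision;          /* per-lock adapt_count state  */
      int writer, nr_readers;
    };

    int
    my_rdlock (struct my_rwlock *rw)
    {
      /* The second argument is evaluated inside the transaction, so a
         concurrent writer taking the real lock aborts elided readers.  */
      if (ELIDE_LOCK (rw->elision, rw->writer == 0 && rw->nr_readers == 0))
        return 0;                  /* elided: no lock word was written  */
      return my_rdlock_slow (rw);  /* normal locking fallback (not shown)  */
    }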
diff --git a/sysdeps/powerpc/nptl/tcb-offsets.sym b/sysdeps/powerpc/nptl/tcb-offsets.sym
index f996759027..d955142aff 100644
--- a/sysdeps/powerpc/nptl/tcb-offsets.sym
+++ b/sysdeps/powerpc/nptl/tcb-offsets.sym
@@ -19,6 +19,7 @@ POINTER_GUARD			(offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (
 TAR_SAVE			(offsetof (tcbhead_t, tar_save) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
 DSO_SLOT1			(offsetof (tcbhead_t, dso_slot1) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
 DSO_SLOT2			(offsetof (tcbhead_t, dso_slot2) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
+TM_CAPABLE			(offsetof (tcbhead_t, tm_capable) - TLS_TCB_OFFSET - sizeof (tcbhead_t))
 #ifndef __ASSUME_PRIVATE_FUTEX
 PRIVATE_FUTEX_OFFSET		thread_offsetof (header.private_futex)
 #endif
diff --git a/sysdeps/powerpc/nptl/tls.h b/sysdeps/powerpc/nptl/tls.h
index b80a5fbf54..9877b73ba1 100644
--- a/sysdeps/powerpc/nptl/tls.h
+++ b/sysdeps/powerpc/nptl/tls.h
@@ -63,6 +63,8 @@ typedef union dtv
    are private.  */
 typedef struct
 {
+  /* Indicate if HTM capable (ISA 2.07).  */
+  int tm_capable;
   /* Reservation for Dynamic System Optimizer ABI.  */
   uintptr_t dso_slot2;
   uintptr_t dso_slot1;
@@ -130,11 +132,17 @@ register void *__thread_register __asm__ ("r13");
    special attention since 'errno' is not yet available and if the
    operation can cause a failure 'errno' must not be touched.  */
 # define TLS_INIT_TP(tcbp) \
-    (__thread_register = (void *) (tcbp) + TLS_TCB_OFFSET, NULL)
+  ({ 									      \
+    __thread_register = (void *) (tcbp) + TLS_TCB_OFFSET;		      \
+    THREAD_SET_TM_CAPABLE (GLRO (dl_hwcap2) & PPC_FEATURE2_HAS_HTM ? 1 : 0);  \
+    NULL;								      \
+  })
 
 /* Value passed to 'clone' for initialization of the thread register.  */
 # define TLS_DEFINE_INIT_TP(tp, pd) \
-  void *tp = (void *) (pd) + TLS_TCB_OFFSET + TLS_PRE_TCB_SIZE
+    void *tp = (void *) (pd) + TLS_TCB_OFFSET + TLS_PRE_TCB_SIZE;	      \
+    (((tcbhead_t *) ((char *) tp - TLS_TCB_OFFSET))[-1].tm_capable) =	      \
+      THREAD_GET_TM_CAPABLE ();
 
 /* Return the address of the dtv for the current thread.  */
 # define THREAD_DTV() \
@@ -188,6 +196,13 @@ register void *__thread_register __asm__ ("r13");
 		     + TLS_PRE_TCB_SIZE))[-1].pointer_guard		      \
      = THREAD_GET_POINTER_GUARD())
 
+/* tm_capable field in TCB head.  */
+# define THREAD_GET_TM_CAPABLE() \
+    (((tcbhead_t *) ((char *) __thread_register				      \
+		     - TLS_TCB_OFFSET))[-1].tm_capable)
+# define THREAD_SET_TM_CAPABLE(value) \
+    (THREAD_GET_TM_CAPABLE () = (value))
+
 /* l_tls_offset == 0 is perfectly valid on PPC, so we have to use some
    different value to mean unset l_tls_offset.  */
 # define NO_TLS_OFFSET		-1
diff --git a/sysdeps/powerpc/powerpc32/bits/atomic.h b/sysdeps/powerpc/powerpc32/bits/atomic.h
index 7613bdc485..7422262dc1 100644
--- a/sysdeps/powerpc/powerpc32/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc32/bits/atomic.h
@@ -95,6 +95,12 @@
 #define __arch_atomic_exchange_and_add_64(mem, value) \
     ({ abort (); (*mem) = (value); })
 
+#define __arch_atomic_exchange_and_add_64_acq(mem, value) \
+    ({ abort (); (*mem) = (value); })
+
+#define __arch_atomic_exchange_and_add_64_rel(mem, value) \
+    ({ abort (); (*mem) = (value); })
+
 #define __arch_atomic_increment_val_64(mem) \
     ({ abort (); (*mem)++; })
 
@@ -117,6 +123,7 @@
 # ifndef UP
 #  define __ARCH_REL_INSTR	"lwsync"
 # endif
+# define atomic_write_barrier()	__asm ("lwsync" ::: "memory")
 #else
 /*
  * Older powerpc32 processors don't support the new "light weight"
@@ -124,6 +131,7 @@
  * for all powerpc32 applications.
  */
 # define atomic_read_barrier()	__asm ("sync" ::: "memory")
+# define atomic_write_barrier()	__asm ("sync" ::: "memory")
 #endif
 
 /*
diff --git a/sysdeps/powerpc/powerpc32/sysdep.h b/sysdeps/powerpc/powerpc32/sysdep.h
index c8a56aadbf..c4b3ca8696 100644
--- a/sysdeps/powerpc/powerpc32/sysdep.h
+++ b/sysdeps/powerpc/powerpc32/sysdep.h
@@ -88,7 +88,23 @@ GOT_LABEL:			;					      \
   cfi_endproc;								      \
   ASM_SIZE_DIRECTIVE(name)
 
+#if !defined IS_IN_rtld && defined (ENABLE_LOCK_ELISION)
+# define ABORT_TRANSACTION \
+    cmpwi    2,0;		\
+    beq      1f;		\
+    lwz      0,TM_CAPABLE(2);	\
+    cmpwi    0,0;		\
+    beq	     1f;		\
+    li	     0,_ABORT_SYSCALL;	\
+    tabort.  0;			\
+    .align 4;			\
+1:
+#else
+# define ABORT_TRANSACTION
+#endif
+
 #define DO_CALL(syscall)						      \
+    ABORT_TRANSACTION							      \
     li 0,syscall;							      \
     sc
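
DO_CALL now expands ABORT_TRANSACTION first because a system call inside an
active transaction could make kernel side effects visible even if the
transaction later rolls back.  What the assembly amounts to in C (a sketch;
tabort. is harmless when no transaction is active, it merely sets cr0):

    /* One load from the TCB via the thread register (r2 on powerpc32,
       r13 on powerpc64), then an unconditional abort request.  */
    static inline void
    abort_transaction_if_needed (void)
    {
      if (__thread_register != NULL     /* cmpwi 2,0: skip early in startup  */
          && THREAD_GET_TM_CAPABLE ())  /* lwz 0,TM_CAPABLE(2)  */
        __builtin_tabort (_ABORT_SYSCALL);  /* tabort. 0  */
    }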
 
diff --git a/sysdeps/powerpc/powerpc64/bits/atomic.h b/sysdeps/powerpc/powerpc64/bits/atomic.h
index 527fe7c133..e64cb9fa54 100644
--- a/sysdeps/powerpc/powerpc64/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc64/bits/atomic.h
@@ -97,7 +97,7 @@
 ({									      \
   unsigned long	__tmp;							      \
   __asm __volatile (__ARCH_REL_INSTR "\n"				      \
-		    "1:	ldarx	%0,0,%2" MUTEX_HINT_REL "\n"		      \
+		    "1:	ldarx	%0,0,%1" MUTEX_HINT_REL "\n"		      \
 		    "	subf.	%0,%2,%0\n"				      \
 		    "	bne	2f\n"					      \
 		    "	stdcx.	%3,0,%1\n"				      \
@@ -183,6 +183,34 @@
       __val;								      \
     })
 
+#define __arch_atomic_exchange_and_add_64_acq(mem, value) \
+    ({									      \
+      __typeof (*mem) __val, __tmp;					      \
+      __asm __volatile ("1:	ldarx	%0,0,%3" MUTEX_HINT_ACQ "\n"	      \
+			"	add	%1,%0,%4\n"			      \
+			"	stdcx.	%1,0,%3\n"			      \
+			"	bne-	1b\n"				      \
+			__ARCH_ACQ_INSTR				      \
+			: "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+			: "b" (mem), "r" (value), "m" (*mem)		      \
+			: "cr0", "memory");				      \
+      __val;								      \
+    })
+
+#define __arch_atomic_exchange_and_add_64_rel(mem, value) \
+    ({									      \
+      __typeof (*mem) __val, __tmp;					      \
+      __asm __volatile (__ARCH_REL_INSTR "\n"				      \
+			"1:	ldarx	%0,0,%3" MUTEX_HINT_REL "\n"	      \
+			"	add	%1,%0,%4\n"			      \
+			"	stdcx.	%1,0,%3\n"			      \
+			"	bne-	1b"				      \
+			: "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+			: "b" (mem), "r" (value), "m" (*mem)		      \
+			: "cr0", "memory");				      \
+      __val;								      \
+    })
+
 #define __arch_atomic_increment_val_64(mem) \
     ({									      \
       __typeof (*(mem)) __val;						      \
@@ -234,6 +262,7 @@
 #ifndef UP
 # define __ARCH_REL_INSTR	"lwsync"
 #endif
+#define atomic_write_barrier()	__asm ("lwsync" ::: "memory")
 
 /*
  * Include the rest of the atomic ops macros which are common to both
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 82722fb69f..b7ea28420f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -2,24 +2,26 @@ ifeq ($(subdir),string)
 sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   memcpy-power4 memcpy-ppc64 memcmp-power7 memcmp-power4 \
 		   memcmp-ppc64 memset-power7 memset-power6 memset-power4 \
-		   memset-ppc64 bzero-power4 bzero-power6 bzero-power7 \
+		   memset-ppc64 memset-power8 \
 		   mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
 		   memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
 		   rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
 		   strnlen-ppc64 strcasecmp-power7 strcasecmp_l-power7 \
-		   strncase-power7 strncase_l-power7 strncmp-power7 \
-		   strncmp-power4 strncmp-ppc64 strchr-power7 strchr-ppc64 \
+		   strncase-power7 strncase_l-power7 \
+		   strncmp-power8 strncmp-power7 strncmp-power4 strncmp-ppc64 \
+		   strchr-power7 strchr-ppc64 \
 		   strchrnul-power7 strchrnul-ppc64 wcschr-power7 \
 		   wcschr-power6 wcschr-ppc64 wcsrchr-power7 wcsrchr-power6 \
 		   wcsrchr-ppc64 wcscpy-power7 wcscpy-power6 wcscpy-ppc64 \
 		   wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \
-		   strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
+		   strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \
+		   stpcpy-power7 stpcpy-ppc64 \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
-		   strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \
-		   strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \
-		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
-		   strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
-		   bcopy-ppc64
+		   strncpy-power7 strncpy-ppc64 \
+		   stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
+		   strcmp-power8 strcmp-power7 strcmp-ppc64 \
+		   strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
+		   memmove-ppc64 bcopy-ppc64 strncpy-power8
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
index ed83541fa5..298cf005a1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -26,14 +26,17 @@ extern __typeof (bzero) __bzero_ppc attribute_hidden;
 extern __typeof (bzero) __bzero_power4 attribute_hidden;
 extern __typeof (bzero) __bzero_power6 attribute_hidden;
 extern __typeof (bzero) __bzero_power7 attribute_hidden;
+extern __typeof (bzero) __bzero_power8 attribute_hidden;
 
 libc_ifunc (__bzero,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __bzero_power7 :
-	      (hwcap & PPC_FEATURE_ARCH_2_05)
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __bzero_power8 :
+	      (hwcap & PPC_FEATURE_HAS_VSX)
+	      ? __bzero_power7 :
+		(hwcap & PPC_FEATURE_ARCH_2_05)
 		? __bzero_power6 :
 		  (hwcap & PPC_FEATURE_POWER4)
-		? __bzero_power4
+		  ? __bzero_power4
             : __bzero_ppc);
 
 weak_alias (__bzero, bzero)
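
libc_ifunc emits an IFUNC resolver, so this selection runs once at symbol
resolution rather than on every call.  Conceptually it reduces to the
following (a sketch; glibc's macro handles the attribute plumbing, and
hwcap/hwcap2 come from the AT_HWCAP/AT_HWCAP2 auxiliary vector entries):

    static __typeof (bzero) *
    bzero_resolver (unsigned long hwcap, unsigned long hwcap2)
    {
      if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
        return __bzero_power8;          /* POWER8 (ISA 2.07)  */
      if (hwcap & PPC_FEATURE_HAS_VSX)
        return __bzero_power7;          /* POWER7  */
      if (hwcap & PPC_FEATURE_ARCH_2_05)
        return __bzero_power6;          /* POWER6  */
      if (hwcap & PPC_FEATURE_POWER4)
        return __bzero_power4;
      return __bzero_ppc;               /* generic baseline  */
    }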
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index a574487f2f..bd92cf6faa 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -34,6 +34,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   size_t i = 0;
 
   unsigned long int hwcap = GLRO(dl_hwcap);
+  unsigned long int hwcap2 = GLRO(dl_hwcap2);
+
  /* hwcap contains only the latest supported ISA; the code checks which
     one it is and fills in the previously supported ones.  */
   if (hwcap & PPC_FEATURE_ARCH_2_06)
@@ -69,6 +71,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/memset.c.  */
   IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __memset_power8)
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
 			      __memset_power7)
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
@@ -79,6 +83,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */
   IFUNC_IMPL (i, name, strcpy,
+	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strcpy_power8)
 	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcpy_power7)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1,
@@ -86,6 +92,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpcpy.c.  */
   IFUNC_IMPL (i, name, stpcpy,
+	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __stpcpy_power8)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap & PPC_FEATURE_HAS_VSX,
 			      __stpcpy_power7)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1,
@@ -100,6 +108,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c.  */
   IFUNC_IMPL (i, name, strncmp,
+	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strncmp_power8)
 	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_HAS_VSX,
 			      __strncmp_power7)
 	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_POWER4,
@@ -134,6 +144,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c.  */
   IFUNC_IMPL (i, name, bzero,
+	      IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __bzero_power8)
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
 			      __bzero_power7)
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
@@ -266,33 +278,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
 			      __strncat_ppc))
 
-  /* Support sysdeps/powerpc/powerpc64/multiarch/strspn.c.  */
-  IFUNC_IMPL (i, name, strspn,
-	      IFUNC_IMPL_ADD (array, i, strspn,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-			      __strspn_power7)
-	      IFUNC_IMPL_ADD (array, i, strspn, 1,
-			      __strspn_ppc))
-
-  /* Support sysdeps/powerpc/powerpc64/multiarch/strcspn.c.  */
-  IFUNC_IMPL (i, name, strcspn,
-	      IFUNC_IMPL_ADD (array, i, strcspn,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-			      __strcspn_power7)
-	      IFUNC_IMPL_ADD (array, i, strcspn, 1,
-			     __strcspn_ppc))
-
-  /* Support sysdeps/powerpc/powerpc64/multiarch/strpbrk.c.  */
-  IFUNC_IMPL (i, name, strpbrk,
-	      IFUNC_IMPL_ADD (array, i, strpbrk,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-			      __strpbrk_power7)
-	      IFUNC_IMPL_ADD (array, i, strpbrk, 1,
-			     __strpbrk_ppc))
-
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
 	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strncpy_power8)
+	      IFUNC_IMPL_ADD (array, i, strncpy,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __strncpy_power7)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
@@ -301,6 +292,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
 	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __stpncpy_power8)
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __stpncpy_power7)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
@@ -309,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c.  */
   IFUNC_IMPL (i, name, strcmp,
 	      IFUNC_IMPL_ADD (array, i, strcmp,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strcmp_power8)
+	      IFUNC_IMPL_ADD (array, i, strcmp,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcmp_power7)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1,
@@ -317,6 +314,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c.  */
   IFUNC_IMPL (i, name, strcat,
 	      IFUNC_IMPL_ADD (array, i, strcat,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strcat_power8)
+	      IFUNC_IMPL_ADD (array, i, strcat,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcat_power7)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S
index 968dc24bd3..1291fb7339 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S
@@ -37,5 +37,7 @@
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(name)
 
-#define NO_BZERO_IMPL
+#undef __bzero
+#define __bzero __bzero_power4
+
 #include <sysdeps/powerpc/powerpc64/power4/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S
index 65519b91f1..3dc199c535 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S
@@ -37,5 +37,7 @@
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(name)
 
-#define NO_BZERO_IMPL
+#undef __bzero
+#define __bzero __bzero_power6
+
 #include <sysdeps/powerpc/powerpc64/power6/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S
index 86765e74ab..fb1a3423ee 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S
@@ -37,5 +37,6 @@
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(name)
 
-#define NO_BZERO_IMPL
+#undef __bzero
+#define __bzero __bzero_power7
 #include <sysdeps/powerpc/powerpc64/power7/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
new file mode 100644
index 0000000000..e8a604b000
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
@@ -0,0 +1,43 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__memset_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__memset_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__memset_power8)
+
+#undef END_GEN_TB
+#define END_GEN_TB(name, mask)					\
+  cfi_endproc;							\
+  TRACEBACK_MASK(__memset_power8,mask)				\
+  END_2(__memset_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef __bzero
+#define __bzero __bzero_power8
+
+#include <sysdeps/powerpc/powerpc64/power8/memset.S>
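
All the multiarch wrappers in this commit use the renaming trick shown here:
redefine the entry/exit macros (EALIGN, END or END_GEN_TB) before including
the generic implementation, so one source file assembles under a per-CPU,
non-exported symbol name that the ifunc resolver can select.  The same
pattern in miniature, in C (names are illustrative, not glibc's):

    /* memset-body.c -- shared implementation; the includer picks the name.  */
    #include <stddef.h>

    #ifndef VARIANT_NAME
    # define VARIANT_NAME my_memset_generic
    #endif

    void *
    VARIANT_NAME (void *s, int c, size_t n)
    {
      unsigned char *p = s;
      while (n-- > 0)
        *p++ = (unsigned char) c;
      return s;
    }

A per-CPU wrapper is then just "#define VARIANT_NAME my_memset_power8"
followed by "#include "memset-body.c"".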
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index aa2ae7056e..9c7ed10c87 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -32,16 +32,19 @@ extern __typeof (__redirect_memset) __memset_ppc attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_power8 attribute_hidden;
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc (__libc_memset,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __memset_power7 :
-	      (hwcap & PPC_FEATURE_ARCH_2_05)
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __memset_power8 :
+	      (hwcap & PPC_FEATURE_HAS_VSX)
+	      ? __memset_power7 :
+		(hwcap & PPC_FEATURE_ARCH_2_05)
 		? __memset_power6 :
 		  (hwcap & PPC_FEATURE_POWER4)
-		? __memset_power4
+		  ? __memset_power4
             : __memset_ppc);
 
 #undef memset
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S b/sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
index 889dfeea8e..66e6f708bd 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
@@ -1,5 +1,5 @@
-/* Optimized strspn implementation for POWER7.
-   Copyright (C) 2014 Free Software Foundation, Inc.
+/* Optimized stpcpy implementation for POWER8/PPC64.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -21,20 +21,20 @@
 #undef EALIGN
 #define EALIGN(name, alignt, words)				\
   .section ".text";						\
-  ENTRY_2(__strspn_power7)					\
+  ENTRY_2(__stpcpy_power8)					\
   .align ALIGNARG(alignt);					\
   EALIGN_W_##words;						\
-  BODY_LABEL(__strspn_power7):					\
+  BODY_LABEL(__stpcpy_power8):					\
   cfi_startproc;						\
-  LOCALENTRY(__strspn_power7)
+  LOCALENTRY(__stpcpy_power8)
 
 #undef END
 #define END(name)						\
   cfi_endproc;							\
-  TRACEBACK(__strspn_power7)					\
-  END_2(__strspn_power7)
+  TRACEBACK(__stpcpy_power8)					\
+  END_2(__stpcpy_power8)
 
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(name)
 
-#include <sysdeps/powerpc/powerpc64/power7/strspn.S>
+#include <sysdeps/powerpc/powerpc64/power8/stpcpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
new file mode 100644
index 0000000000..d5d835de91
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
@@ -0,0 +1,39 @@
+/* Optimized stpncpy implementation for POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define USE_AS_STPNCPY
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__stpncpy_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__stpncpy_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__stpncpy_power8)
+
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__stpncpy_power8)					\
+  END_2(__stpncpy_power8)
+
+#include <sysdeps/powerpc/powerpc64/power8/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index dbf85214a2..3ee50e527c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -23,10 +23,13 @@
 
 extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
 
 libc_ifunc (__stpncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __stpncpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __stpncpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __stpncpy_power7
             : __stpncpy_ppc);
 
 weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
index 8dea70edc1..6c7544c959 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014 Free Software Foundation, Inc.
+/* Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -13,18 +13,18 @@
 
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
 
-#define STRPBRK __strpbrk_ppc
-#ifdef SHARED
+#define STRCAT __strcat_power8
 
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
-  __hidden_ver1 (__strpbrk_ppc, __GI_strpbrk, __strpbrk_ppc);
-#endif
+#undef libc_hidden_def
+#define libc_hidden_def(name)
 
-extern __typeof (strpbrk) __strpbrk_ppc attribute_hidden;
+extern typeof (strcpy) __strcpy_power8;
+extern typeof (strlen) __strlen_power7;
 
-#include <string/strpbrk.c>
+#define strcpy __strcpy_power8
+#define strlen __strlen_power7
+#include <sysdeps/powerpc/strcat.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
index 847a62de52..289e9b2365 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
@@ -23,9 +23,12 @@
 
 extern __typeof (strcat) __strcat_ppc attribute_hidden;
 extern __typeof (strcat) __strcat_power7 attribute_hidden;
+extern __typeof (strcat) __strcat_power8 attribute_hidden;
 
 libc_ifunc (strcat,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcat_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strcat_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strcat_power7
             : __strcat_ppc);
 #endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
index 663ca36568..dc4bfac9ee 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
@@ -1,5 +1,5 @@
-/* Optimized strpbrk implementation for POWER7.
-   Copyright (C) 2014 Free Software Foundation, Inc.
+/* Optimized strcmp implementation for POWER8/PPC64.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -21,20 +21,20 @@
 #undef EALIGN
 #define EALIGN(name, alignt, words)				\
   .section ".text";						\
-  ENTRY_2(__strpbrk_power7)					\
+  ENTRY_2(__strcmp_power8)					\
   .align ALIGNARG(alignt);					\
   EALIGN_W_##words;						\
-  BODY_LABEL(__strpbrk_power7):					\
+  BODY_LABEL(__strcmp_power8):					\
   cfi_startproc;						\
-  LOCALENTRY(__strpbrk_power7)
+  LOCALENTRY(__strcmp_power8)
 
 #undef END
 #define END(name)						\
   cfi_endproc;							\
-  TRACEBACK(__strpbrk_power7)					\
-  END_2(__strpbrk_power7)
+  TRACEBACK(__strcmp_power8)					\
+  END_2(__strcmp_power8)
 
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(name)
 
-#include <sysdeps/powerpc/powerpc64/power7/strpbrk.S>
+#include <sysdeps/powerpc/powerpc64/power8/strcmp.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
index 2013301aa1..c711969992 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
@@ -23,9 +23,12 @@
 
 extern __typeof (strcmp) __strcmp_ppc attribute_hidden;
 extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
+extern __typeof (strcmp) __strcmp_power8 attribute_hidden;
 
 libc_ifunc (strcmp,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcmp_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+              ? __strcmp_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strcmp_power7
             : __strcmp_ppc);
 #endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
index 02ffcc89b8..64cbc163a4 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcspn-power7.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
@@ -1,5 +1,5 @@
-/* Optimized strcspn implementation for POWER7.
-   Copyright (C) 2014 Free Software Foundation, Inc.
+/* Optimized strcpy implementation for POWER8/PPC64.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -21,20 +21,20 @@
 #undef EALIGN
 #define EALIGN(name, alignt, words)				\
   .section ".text";						\
-  ENTRY_2(__strcspn_power7)					\
+  ENTRY_2(__strcpy_power8)					\
   .align ALIGNARG(alignt);					\
   EALIGN_W_##words;						\
-  BODY_LABEL(__strcspn_power7):					\
+  BODY_LABEL(__strcpy_power8):					\
   cfi_startproc;						\
-  LOCALENTRY(__strcspn_power7)
+  LOCALENTRY(__strcpy_power8)
 
 #undef END
 #define END(name)						\
   cfi_endproc;							\
-  TRACEBACK(__strcspn_power7)					\
-  END_2(__strcspn_power7)
+  TRACEBACK(__strcpy_power8)					\
+  END_2(__strcpy_power8)
 
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(name)
 
-#include <sysdeps/powerpc/powerpc64/power7/strcspn.S>
+#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
index 1b6e9e0665..20ef73f7d5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
@@ -23,9 +23,12 @@
 
 extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
 extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
+extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
 
 libc_ifunc (strcpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strcpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strcpy_power7
             : __strcpy_ppc);
 #endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c
index 5f8b61054d..39b1aebe9b 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014 Free Software Foundation, Inc.
+/* Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -13,18 +13,19 @@
 
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
+   <http://www.gnu.org/licenses/>.  */
 
 #include <string.h>
 
-#define STRCSPN __strcspn_ppc
-#ifdef SHARED
+#define STRNCAT __strncat_power7
 
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
-  __hidden_ver1 (__strcspn_ppc, __GI_strcspn, __strcspn_ppc);
-#endif
+extern __typeof (strncat) __strncat_power7 attribute_hidden;
+extern __typeof (strlen) __strlen_power7 attribute_hidden;
+extern __typeof (strnlen) __strnlen_power7 attribute_hidden;
+extern __typeof (memcpy) __memcpy_power7 attribute_hidden;
 
-extern __typeof (strcspn) __strcspn_ppc attribute_hidden;
+#define strlen    __strlen_power7
+#define __strnlen __strnlen_power7
+#define memcpy    __memcpy_power7
 
-#include <string/strcspn.c>
+#include <string/strncat.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S
new file mode 100644
index 0000000000..8d7223d256
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S
@@ -0,0 +1,40 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name,alignt,words)				\
+  .section ".text";						\
+  ENTRY_2(__strncmp_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strncmp_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strncmp_power8)
+
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strncmp_power8)					\
+  END_2(__strncmp_power8)
+
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strncmp.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
index 9829d69395..5e767839b9 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
@@ -25,13 +25,16 @@
 extern __typeof (strncmp) __strncmp_ppc attribute_hidden;
 extern __typeof (strncmp) __strncmp_power4 attribute_hidden;
 extern __typeof (strncmp) __strncmp_power7 attribute_hidden;
+extern __typeof (strncmp) __strncmp_power8 attribute_hidden;
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc (strncmp,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strncmp_power7 :
-	      (hwcap & PPC_FEATURE_POWER4)
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strncmp_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strncmp_power7 :
+		(hwcap & PPC_FEATURE_POWER4)
 		? __strncmp_power4
             : __strncmp_ppc);
 #endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
index ead4a9afbe..ed906a4394 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
@@ -1,5 +1,5 @@
-/* Optimized strncat implementation for POWER7.
-   Copyright (C) 2014 Free Software Foundation, Inc.
+/* Optimized strncpy implementation for POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -21,22 +21,20 @@
 #undef EALIGN
 #define EALIGN(name, alignt, words)				\
   .section ".text";						\
-  ENTRY_2(__strncat_power7)					\
+  ENTRY_2(__strncpy_power8)					\
   .align ALIGNARG(alignt);					\
   EALIGN_W_##words;						\
-  BODY_LABEL(__strncat_power7):					\
+  BODY_LABEL(__strncpy_power8):					\
   cfi_startproc;						\
-  LOCALENTRY(__strncat_power7)
+  LOCALENTRY(__strncpy_power8)
 
 #undef END
 #define END(name)						\
   cfi_endproc;							\
-  TRACEBACK(__strncat_power7)					\
-  END_2(__strncat_power7)
+  TRACEBACK(__strncpy_power8)					\
+  END_2(__strncpy_power8)
 
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(name)
 
-#define STRLEN __strlen_power7
-
-#include <sysdeps/powerpc/powerpc64/power7/strncat.S>
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index 8fd5e4b0c8..19927bc68c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -24,12 +24,15 @@
 
 extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
 extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
+extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
  ifunc symbol properly. */
 libc_ifunc (strncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strncpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strncpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strncpy_power7
             : __strncpy_ppc);
 
 #endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c
deleted file mode 100644
index d543772a97..0000000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (C) 2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <string.h>
-
-#define STRSPN __strspn_ppc
-#undef weak_alias
-#define weak_alias(name, aliasname) \
-  extern __typeof (__strspn_ppc) aliasname \
-    __attribute__ ((weak, alias ("__strspn_ppc")));
-#if !defined(NOT_IN_libc) && defined(SHARED)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
-  __hidden_ver1(__strspn_ppc, __GI_strspn, __strspn_ppc);
-#endif
-
-extern __typeof (strspn) __strspn_ppc attribute_hidden;
-
-#include <string/strspn.c>
diff --git a/sysdeps/powerpc/powerpc64/power4/memset.S b/sysdeps/powerpc/powerpc64/power4/memset.S
index 3a1e9dc76a..b433d49be8 100644
--- a/sysdeps/powerpc/powerpc64/power4/memset.S
+++ b/sysdeps/powerpc/powerpc64/power4/memset.S
@@ -235,7 +235,6 @@ L(medium_28t):
 END_GEN_TB (memset,TB_TOCLESS)
 libc_hidden_builtin_def (memset)
 
-#ifndef NO_BZERO_IMPL
 /* Copied from bzero.S to prevent the linker from inserting a stub
    between bzero and memset.  */
 ENTRY (__bzero)
@@ -243,7 +242,7 @@ ENTRY (__bzero)
 	mr	r5,r4
 	li	r4,0
 	b	L(_memset)
-END_GEN_TB (__bzero,TB_TOCLESS)
-
+END (__bzero)
+#ifndef __bzero
 weak_alias (__bzero, bzero)
 #endif
diff --git a/sysdeps/powerpc/powerpc64/power6/memset.S b/sysdeps/powerpc/powerpc64/power6/memset.S
index b5115a7989..6fffe0ec66 100644
--- a/sysdeps/powerpc/powerpc64/power6/memset.S
+++ b/sysdeps/powerpc/powerpc64/power6/memset.S
@@ -379,7 +379,6 @@ L(medium_28t):
 END_GEN_TB (memset,TB_TOCLESS)
 libc_hidden_builtin_def (memset)
 
-#ifndef NO_BZERO_IMPL
 /* Copied from bzero.S to prevent the linker from inserting a stub
    between bzero and memset.  */
 ENTRY (__bzero)
@@ -387,7 +386,7 @@ ENTRY (__bzero)
 	mr	r5,r4
 	li	r4,0
 	b	L(_memset)
-END_GEN_TB (__bzero,TB_TOCLESS)
-
+END (__bzero)
+#ifndef __bzero
 weak_alias (__bzero, bzero)
 #endif
diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S
index 09bff696ff..98b9e54ea9 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcmp.S
@@ -26,18 +26,48 @@
 EALIGN (memcmp, 4, 0)
 	CALL_MCOUNT 3
 
-#define rRTN	r3
-#define rSTR1	r3	/* first string arg */
-#define rSTR2	r4	/* second string arg */
-#define rN	r5	/* max string length */
-#define rWORD1	r6	/* current word in s1 */
-#define rWORD2	r7	/* current word in s2 */
-#define rWORD3	r8	/* next word in s1 */
-#define rWORD4	r9	/* next word in s2 */
-#define rWORD5	r10	/* next word in s1 */
-#define rWORD6	r11	/* next word in s2 */
-#define rWORD7	r30	/* next word in s1 */
-#define rWORD8	r31	/* next word in s2 */
+#define rRTN		r3
+#define rSTR1		r3	/* first string arg */
+#define rSTR2		r4	/* second string arg */
+#define rN		r5	/* max string length */
+#define rWORD1		r6	/* current word in s1 */
+#define rWORD2		r7	/* current word in s2 */
+#define rWORD3		r8	/* next word in s1 */
+#define rWORD4		r9	/* next word in s2 */
+#define rWORD5		r10	/* next word in s1 */
+#define rWORD6		r11	/* next word in s2 */
+
+#define rOFF8		r20	/* 8 bytes offset.  */
+#define rOFF16  	r21	/* 16 bytes offset.  */
+#define rOFF24		r22	/* 24 bytes offset.  */
+#define rOFF32		r23	/* 32 bytes offset.  */
+#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
+#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
+#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
+#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
+#define rSHR		r28	/* Unaligned shift right count.  */
+#define rSHL		r29	/* Unaligned shift left count.  */
+#define rWORD7		r30	/* next word in s1 */
+#define rWORD8		r31	/* next word in s2 */
+
+#define rWORD8SAVE	(-8)
+#define rWORD7SAVE	(-16)
+#define rOFF8SAVE	(-24)
+#define rOFF16SAVE	(-32)
+#define rOFF24SAVE	(-40)
+#define rOFF32SAVE	(-48)
+#define rSHRSAVE	(-56)
+#define rSHLSAVE	(-64)
+#define rWORD8SHIFTSAVE	(-72)
+#define rWORD2SHIFTSAVE	(-80)
+#define rWORD4SHIFTSAVE	(-88)
+#define rWORD6SHIFTSAVE	(-96)
+
+#ifdef __LITTLE_ENDIAN__
+# define LD	ldbrx
+#else
+# define LD	ldx
+#endif
 
 	xor	r0, rSTR2, rSTR1
 	cmpldi	cr6, rN, 0
@@ -51,10 +81,24 @@ EALIGN (memcmp, 4, 0)
 /* If less than 8 bytes or not aligned, use the unaligned
    byte loop.  */
 	blt	cr1, L(bytealigned)
-	std	rWORD8, -8(r1)
-	cfi_offset(rWORD8, -8)
-	std	rWORD7, -16(r1)
-	cfi_offset(rWORD7, -16)
+	std	rWORD8, rWORD8SAVE(r1)
+	cfi_offset(rWORD8, rWORD8SAVE)
+	std	rWORD7, rWORD7SAVE(r1)
+	cfi_offset(rWORD7, rWORD7SAVE)
+	std	rOFF8, rOFF8SAVE(r1)
+	cfi_offset(rOFF8, rOFF8SAVE)
+	std	rOFF16, rOFF16SAVE(r1)
+	cfi_offset(rOFF16, rOFF16SAVE)
+	std	rOFF24, rOFF24SAVE(r1)
+	cfi_offset(rOFF24, rOFF24SAVE)
+	std	rOFF32, rOFF32SAVE(r1)
+	cfi_offset(rOFF32, rOFF32SAVE)
+
+	li	rOFF8,8
+	li	rOFF16,16
+	li	rOFF24,24
+	li	rOFF32,32
+
 	bne	L(unaligned)
 /* At this point we know both strings have the same alignment and the
    compare length is at least 8 bytes.  r12 contains the low order
@@ -79,15 +123,8 @@ L(samealignment):
 	sldi	rWORD6, r12, 3
 	srdi	r0, rN, 5	/* Divide by 32 */
 	andi.	r12, rN, 24	/* Get the DW remainder */
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 0(rSTR1)
-	ld	rWORD2, 0(rSTR2)
-#endif
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
 	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
 	clrldi	rN, rN, 61
@@ -104,15 +141,8 @@ L(dsP1):
 	cmpld	cr5, rWORD5, rWORD6
 	blt	cr7, L(dP1x)
 /* Do something useful in this cycle since we have to branch anyway.  */
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 	b	L(dP1e)
 /* Remainder is 16 */
@@ -123,15 +153,8 @@ L(dPs2):
 	cmpld	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP2x)
 /* Do something useful in this cycle since we have to branch anyway.  */
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD7, 8(rSTR1)
-	ld	rWORD8, 8(rSTR2)
-#endif
+	LD	rWORD7, rOFF8, rSTR1
+	LD	rWORD8, rOFF8, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 	b	L(dP2e)
 /* Remainder is 24 */
@@ -173,72 +196,43 @@ L(dP1):
    change any on the early exit path.  The key here is the non-early
    exit path only cares about the condition code (cr5), not about which
    register pair was used.  */
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 0(rSTR1)
-	ld	rWORD6, 0(rSTR2)
-#endif
+	LD	rWORD5, 0, rSTR1
+	LD	rWORD6, 0, rSTR2
 	cmpld	cr5, rWORD5, rWORD6
 	blt	cr7, L(dP1x)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 L(dP1e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 16(rSTR1)
-	ld	rWORD4, 16(rSTR2)
-#endif
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 24(rSTR1)
-	ld	rWORD6, 24(rSTR2)
-#endif
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr5, L(dLcr5x)
 	bne	cr7, L(dLcr7x)
 
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ldu	rWORD7, 32(rSTR1)
-	ldu	rWORD8, 32(rSTR2)
-#endif
+	LD	rWORD7, rOFF32, rSTR1
+	LD	rWORD8, rOFF32, rSTR2
+	addi	rSTR1, rSTR1, 32
+	addi	rSTR2, rSTR2, 32
 	bne	cr1, L(dLcr1)
 	cmpld	cr5, rWORD7, rWORD8
 	bdnz	L(dLoop)
 	bne	cr6, L(dLcr6)
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 	.align	3
 L(dP1x):
 	sldi.	r12, rN, 3
 	bne	cr5, L(dLcr5x)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	L(d00)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 0
 	blr
 
@@ -246,79 +240,41 @@ L(dP1x):
 	.align	4
 L(dP2):
 	mtctr	r0
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 0(rSTR1)
-	ld	rWORD6, 0(rSTR2)
-#endif
+	LD	rWORD5, 0, rSTR1
+	LD	rWORD6, 0, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP2x)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD7, 8(rSTR1)
-	ld	rWORD8, 8(rSTR2)
-#endif
+	LD	rWORD7, rOFF8, rSTR1
+	LD	rWORD8, rOFF8, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 L(dP2e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 16(rSTR1)
-	ld	rWORD2, 16(rSTR2)
-#endif
+	LD	rWORD1, rOFF16, rSTR1
+	LD	rWORD2, rOFF16, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 24(rSTR1)
-	ld	rWORD4, 24(rSTR2)
-#endif
+	LD	rWORD3, rOFF24, rSTR1
+	LD	rWORD4, rOFF24, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-#endif
 	bne	cr6, L(dLcr6)
 	bne	cr5, L(dLcr5)
 	b	L(dLoop2)
-/* Again we are on a early exit path (16-23 byte compare), we want to
-   only use volatile registers and avoid restoring non-volatile
-   registers.  */
 	.align	4
 L(dP2x):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 8(rSTR1)
-	ld	rWORD4, 8(rSTR2)
-#endif
+	LD	rWORD3, rOFF8, rSTR1
+	LD	rWORD4, rOFF8, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 	sldi.	r12, rN, 3
 	bne	cr6, L(dLcr6x)
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-#endif
 	bne	cr1, L(dLcr1x)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	L(d00)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 0
 	blr
 
@@ -326,52 +282,22 @@ L(dP2x):
 	.align	4
 L(dP3):
 	mtctr	r0
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 0(rSTR1)
-	ld	rWORD4, 0(rSTR2)
-#endif
+	LD	rWORD3, 0, rSTR1
+	LD	rWORD4, 0, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 L(dP3e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 8(rSTR1)
-	ld	rWORD6, 8(rSTR2)
-#endif
+	LD	rWORD5, rOFF8, rSTR1
+	LD	rWORD6, rOFF8, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP3x)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD7, 16(rSTR1)
-	ld	rWORD8, 16(rSTR2)
-#endif
+	LD	rWORD7, rOFF16, rSTR1
+	LD	rWORD8, rOFF16, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 24(rSTR1)
-	ld	rWORD2, 24(rSTR2)
-#endif
+	LD	rWORD1, rOFF24, rSTR1
+	LD	rWORD2, rOFF24, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
-#endif
 	bne	cr1, L(dLcr1)
 	bne	cr6, L(dLcr6)
 	b	L(dLoop1)
@@ -380,26 +306,21 @@ L(dP3e):
    registers.  */
 	.align	4
 L(dP3x):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 16(rSTR1)
-	ld	rWORD2, 16(rSTR2)
-#endif
+	LD	rWORD1, rOFF16, rSTR1
+	LD	rWORD2, rOFF16, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 	sldi.	r12, rN, 3
 	bne	cr1, L(dLcr1x)
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
-#endif
 	bne	cr6, L(dLcr6x)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	cr7, L(dLcr7x)
 	bne	L(d00)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 0
 	blr
 
@@ -407,46 +328,20 @@ L(dP3x):
 	.align	4
 L(dP4):
 	mtctr	r0
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 0(rSTR1)
-	ld	rWORD2, 0(rSTR2)
-#endif
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 L(dP4e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 8(rSTR1)
-	ld	rWORD4, 8(rSTR2)
-#endif
+	LD	rWORD3, rOFF8, rSTR1
+	LD	rWORD4, rOFF8, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 16(rSTR1)
-	ld	rWORD6, 16(rSTR2)
-#endif
+	LD	rWORD5, rOFF16, rSTR1
+	LD	rWORD6, rOFF16, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ldu	rWORD7, 24(rSTR1)
-	ldu	rWORD8, 24(rSTR2)
-#endif
+	LD	rWORD7, rOFF24, rSTR1
+	LD	rWORD8, rOFF24, rSTR2
+	addi	rSTR1, rSTR1, 24
+	addi	rSTR2, rSTR2, 24
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr7, L(dLcr7)
 	bne	cr1, L(dLcr1)
@@ -454,51 +349,25 @@ L(dP4e):
 /* This is the primary loop */
 	.align	4
 L(dLoop):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr6, L(dLcr6)
 L(dLoop1):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 16(rSTR1)
-	ld	rWORD4, 16(rSTR2)
-#endif
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr5, L(dLcr5)
 L(dLoop2):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 24(rSTR1)
-	ld	rWORD6, 24(rSTR2)
-#endif
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr7, L(dLcr7)
 L(dLoop3):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ldu	rWORD7, 32(rSTR1)
-	ldu	rWORD8, 32(rSTR2)
-#endif
+	LD	rWORD7, rOFF32, rSTR1
+	LD	rWORD8, rOFF32, rSTR2
+	addi	rSTR1, rSTR1, 32
+	addi	rSTR2, rSTR2, 32
 	bne	cr1, L(dLcr1)
 	cmpld	cr7, rWORD1, rWORD2
 	bdnz	L(dLoop)
@@ -519,62 +388,75 @@ L(d14):
 	sldi.	r12, rN, 3
 	bne	cr5, L(dLcr5)
 L(d04):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
-	beq	L(zeroLength)
+	beq	L(duzeroLength)
 /* At this point we have a remainder of 1 to 7 bytes to compare.  Since
    we are aligned it is safe to load the whole double word, and use
    shift right double to eliminate bits beyond the compare length.  */
 L(d00):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
 	srd	rWORD1, rWORD1, rN
 	srd	rWORD2, rWORD2, rN
 	cmpld	cr7, rWORD1, rWORD2
 	bne	cr7, L(dLcr7x)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 0
 	blr
 
 	.align	4
 L(dLcr7):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 L(dLcr7x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 1
 	bgtlr	cr7
 	li	rRTN, -1
 	blr
 	.align	4
 L(dLcr1):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 L(dLcr1x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 1
 	bgtlr	cr1
 	li	rRTN, -1
 	blr
 	.align	4
 L(dLcr6):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 L(dLcr6x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 1
 	bgtlr	cr6
 	li	rRTN, -1
 	blr
 	.align	4
 L(dLcr5):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 L(dLcr5x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 1
 	bgtlr	cr5
 	li	rRTN, -1
@@ -583,10 +465,6 @@ L(dLcr5x):
 	.align	4
 L(bytealigned):
 	mtctr	rN
-#if 0
-/* Huh?  We've already branched on cr6!  */
-	beq	cr6, L(zeroLength)
-#endif
 
 /* We need to prime this loop.  This loop is swing modulo scheduled
    to avoid pipe delays.  The dependent instruction latencies (load to
@@ -685,6 +563,7 @@ L(b11):
 L(bx12):
 	sub	rRTN, rWORD1, rWORD2
 	blr
+
 	.align	4
 L(zeroLength):
 	li	rRTN, 0
@@ -705,42 +584,36 @@ L(zeroLength):
    we need to adjust the length (rN) and special case the loop
    versioning for the first DW. This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
-#define rSHL		r29	/* Unaligned shift left count.  */
-#define rSHR		r28	/* Unaligned shift right count.  */
-#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
-#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
-#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
-#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
 L(unaligned):
-	std	rSHL, -24(r1)
-	cfi_offset(rSHL, -24)
+	std	rSHL, rSHLSAVE(r1)
+	cfi_offset(rSHL, rSHLSAVE)
 	clrldi	rSHL, rSTR2, 61
 	beq	cr6, L(duzeroLength)
-	std	rSHR, -32(r1)
-	cfi_offset(rSHR, -32)
+	std	rSHR, rSHRSAVE(r1)
+	cfi_offset(rSHR, rSHRSAVE)
 	beq	cr5, L(DWunaligned)
-	std	rWORD8_SHIFT, -40(r1)
-	cfi_offset(rWORD8_SHIFT, -40)
+	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
 /* Adjust the logical start of rSTR2 to compensate for the extra bits
    in the 1st rSTR1 DW.  */
 	sub	rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before the one that contains
   the actual start of rSTR2.  */
 	clrrdi	rSTR2, rSTR2, 3
-	std	rWORD2_SHIFT, -48(r1)
-	cfi_offset(rWORD2_SHIFT, -48)
+	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
 /* Compute the left/right shift counts for the unaligned rSTR2,
    compensating for the logical (DW aligned) start of rSTR1.  */
 	clrldi	rSHL, rWORD8_SHIFT, 61
 	clrrdi	rSTR1, rSTR1, 3
-	std	rWORD4_SHIFT, -56(r1)
-	cfi_offset(rWORD4_SHIFT, -56)
+	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
 	sldi	rSHL, rSHL, 3
 	cmpld	cr5, rWORD8_SHIFT, rSTR2
 	add	rN, rN, r12
 	sldi	rWORD6, r12, 3
-	std	rWORD6_SHIFT, -64(r1)
-	cfi_offset(rWORD6_SHIFT, -64)
+	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
 	subfic	rSHR, rSHL, 64
 	srdi	r0, rN, 5	/* Divide by 32 */
 	andi.	r12, rN, 24	/* Get the DW remainder */
@@ -750,25 +623,13 @@ L(unaligned):
    this may cross a page boundary and cause a page fault.  */
 	li	rWORD8, 0
 	blt	cr5, L(dus0)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD8, 0, rSTR2
+	LD	rWORD8, 0, rSTR2
 	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD8, 0(rSTR2)
-	addi	rSTR2, rSTR2, 8
-#endif
 	sld	rWORD8, rWORD8, rSHL
 
 L(dus0):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 0(rSTR1)
-	ld	rWORD2, 0(rSTR2)
-#endif
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
 	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
 	srd	r12, rWORD2, rSHR
@@ -796,12 +657,7 @@ L(dusP1):
 	beq	L(duZeroReturn)
 	li	r0, 0
 	ble	cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD2, rOFF8, rSTR2
 	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 16 */
@@ -832,27 +688,21 @@ L(duPs4):
    compare length is at least 8 bytes.  */
 	.align	4
 L(DWunaligned):
-	std	rWORD8_SHIFT, -40(r1)
-	cfi_offset(rWORD8_SHIFT, -40)
+	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
 	clrrdi	rSTR2, rSTR2, 3
-	std	rWORD2_SHIFT, -48(r1)
-	cfi_offset(rWORD2_SHIFT, -48)
+	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
 	srdi	r0, rN, 5	/* Divide by 32 */
-	std	rWORD4_SHIFT, -56(r1)
-	cfi_offset(rWORD4_SHIFT, -56)
+	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
 	andi.	r12, rN, 24	/* Get the DW remainder */
-	std	rWORD6_SHIFT, -64(r1)
-	cfi_offset(rWORD6_SHIFT, -64)
+	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
 	sldi	rSHL, rSHL, 3
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD6, 0, rSTR2
+	LD	rWORD6, 0, rSTR2
+	LD	rWORD8, rOFF8, rSTR2
 	addi	rSTR2, rSTR2, 8
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD6, 0(rSTR2)
-	ldu	rWORD8, 8(rSTR2)
-#endif
 	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
 	clrldi	rN, rN, 61
@@ -867,52 +717,26 @@ L(DWunaligned):
 	.align	4
 L(duP1):
 	srd	r12, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	addi	rSTR1, rSTR1, 8
-#else
-	ld	rWORD7, 0(rSTR1)
-#endif
+	LD	rWORD7, 0, rSTR1
 	sld	rWORD8_SHIFT, rWORD8, rSHL
 	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP1x)
 L(duP1e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 	srd	r0, rWORD2, rSHR
 	sld	rWORD2_SHIFT, rWORD2, rSHL
 	or	rWORD2, r0, rWORD8_SHIFT
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 16(rSTR1)
-	ld	rWORD4, 16(rSTR2)
-#endif
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 	srd	r12, rWORD4, rSHR
 	sld	rWORD4_SHIFT, rWORD4, rSHL
 	bne	cr5, L(duLcr5)
 	or	rWORD4, r12, rWORD2_SHIFT
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 24(rSTR1)
-	ld	rWORD6, 24(rSTR2)
-#endif
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 	srd	r0, rWORD6, rSHR
 	sld	rWORD6_SHIFT, rWORD6, rSHL
@@ -932,82 +756,47 @@ L(duP1x):
 	beq	L(duZeroReturn)
 	li	r0, 0
 	ble	cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD2, rOFF8, rSTR2
 	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 16 */
 	.align	4
 L(duP2):
 	srd	r0, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	addi	rSTR1, rSTR1, 8
-#else
-	ld	rWORD5, 0(rSTR1)
-#endif
+	LD	rWORD5, 0, rSTR1
 	or	rWORD6, r0, rWORD6_SHIFT
 	sld	rWORD6_SHIFT, rWORD8, rSHL
 L(duP2e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD7, 8(rSTR1)
-	ld	rWORD8, 8(rSTR2)
-#endif
+	LD	rWORD7, rOFF8, rSTR1
+	LD	rWORD8, rOFF8, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	srd	r12, rWORD8, rSHR
 	sld	rWORD8_SHIFT, rWORD8, rSHL
 	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP2x)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 16(rSTR1)
-	ld	rWORD2, 16(rSTR2)
-#endif
+	LD	rWORD1, rOFF16, rSTR1
+	LD	rWORD2, rOFF16, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
 	srd	r0, rWORD2, rSHR
 	sld	rWORD2_SHIFT, rWORD2, rSHL
 	or	rWORD2, r0, rWORD8_SHIFT
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 24(rSTR1)
-	ld	rWORD4, 24(rSTR2)
-#endif
+	LD	rWORD3, rOFF24, rSTR1
+	LD	rWORD4, rOFF24, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 	bne	cr5, L(duLcr5)
 	srd	r12, rWORD4, rSHR
 	sld	rWORD4_SHIFT, rWORD4, rSHL
 	or	rWORD4, r12, rWORD2_SHIFT
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-#endif
 	cmpld	cr1, rWORD3, rWORD4
 	b	L(duLoop2)
 	.align	4
 L(duP2x):
 	cmpld	cr5, rWORD7, rWORD8
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-#endif
 	bne	cr6, L(duLcr6)
 	sldi.	rN, rN, 3
 	bne	cr5, L(duLcr5)
@@ -1015,12 +804,7 @@ L(duP2x):
 	beq	L(duZeroReturn)
 	li	r0, 0
 	ble	cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD2, rOFF8, rSTR2
 	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 
@@ -1028,73 +812,39 @@ L(duP2x):
 	.align	4
 L(duP3):
 	srd	r12, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	addi	rSTR1, rSTR1, 8
-#else
-	ld	rWORD3, 0(rSTR1)
-#endif
+	LD	rWORD3, 0, rSTR1
 	sld	rWORD4_SHIFT, rWORD8, rSHL
 	or	rWORD4, r12, rWORD6_SHIFT
 L(duP3e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 8(rSTR1)
-	ld	rWORD6, 8(rSTR2)
-#endif
+	LD	rWORD5, rOFF8, rSTR1
+	LD	rWORD6, rOFF8, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 	srd	r0, rWORD6, rSHR
 	sld	rWORD6_SHIFT, rWORD6, rSHL
 	or	rWORD6, r0, rWORD4_SHIFT
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD7, 16(rSTR1)
-	ld	rWORD8, 16(rSTR2)
-#endif
+	LD	rWORD7, rOFF16, rSTR1
+	LD	rWORD8, rOFF16, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr1, L(duLcr1)
 	srd	r12, rWORD8, rSHR
 	sld	rWORD8_SHIFT, rWORD8, rSHL
 	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP3x)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 24(rSTR1)
-	ld	rWORD2, 24(rSTR2)
-#endif
+	LD	rWORD1, rOFF24, rSTR1
+	LD	rWORD2, rOFF24, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
 	srd	r0, rWORD2, rSHR
 	sld	rWORD2_SHIFT, rWORD2, rSHL
 	or	rWORD2, r0, rWORD8_SHIFT
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
-#endif
 	cmpld	cr7, rWORD1, rWORD2
 	b	L(duLoop1)
 	.align	4
 L(duP3x):
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
-#endif
-#if 0
-/* Huh?  We've already branched on cr1!  */
-	bne	cr1, L(duLcr1)
-#endif
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
 	sldi.	rN, rN, 3
@@ -1103,12 +853,7 @@ L(duP3x):
 	beq	L(duZeroReturn)
 	li	r0, 0
 	ble	cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD2, rOFF8, rSTR2
 	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 
@@ -1117,51 +862,27 @@ L(duP3x):
 L(duP4):
 	mtctr	r0
 	srd	r0, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	addi	rSTR1, rSTR1, 8
-#else
-	ld	rWORD1, 0(rSTR1)
-#endif
+	LD	rWORD1, 0, rSTR1
 	sld	rWORD2_SHIFT, rWORD8, rSHL
 	or	rWORD2, r0, rWORD6_SHIFT
 L(duP4e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 8(rSTR1)
-	ld	rWORD4, 8(rSTR2)
-#endif
+	LD	rWORD3, rOFF8, rSTR1
+	LD	rWORD4, rOFF8, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 	srd	r12, rWORD4, rSHR
 	sld	rWORD4_SHIFT, rWORD4, rSHL
 	or	rWORD4, r12, rWORD2_SHIFT
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 16(rSTR1)
-	ld	rWORD6, 16(rSTR2)
-#endif
+	LD	rWORD5, rOFF16, rSTR1
+	LD	rWORD6, rOFF16, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr7, L(duLcr7)
 	srd	r0, rWORD6, rSHR
 	sld	rWORD6_SHIFT, rWORD6, rSHL
 	or	rWORD6, r0, rWORD4_SHIFT
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ldu	rWORD7, 24(rSTR1)
-	ldu	rWORD8, 24(rSTR2)
-#endif
+	LD	rWORD7, rOFF24, rSTR1
+	LD	rWORD8, rOFF24, rSTR2
+	addi	rSTR1, rSTR1, 24
+	addi	rSTR2, rSTR2, 24
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr1, L(duLcr1)
 	srd	r12, rWORD8, rSHR
@@ -1172,60 +893,34 @@ L(duP4e):
 /* This is the primary loop */
 	.align	4
 L(duLoop):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr6, L(duLcr6)
 	srd	r0, rWORD2, rSHR
 	sld	rWORD2_SHIFT, rWORD2, rSHL
 	or	rWORD2, r0, rWORD8_SHIFT
 L(duLoop1):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 16(rSTR1)
-	ld	rWORD4, 16(rSTR2)
-#endif
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr5, L(duLcr5)
 	srd	r12, rWORD4, rSHR
 	sld	rWORD4_SHIFT, rWORD4, rSHL
 	or	rWORD4, r12, rWORD2_SHIFT
 L(duLoop2):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 24(rSTR1)
-	ld	rWORD6, 24(rSTR2)
-#endif
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr7, L(duLcr7)
 	srd	r0, rWORD6, rSHR
 	sld	rWORD6_SHIFT, rWORD6, rSHL
 	or	rWORD6, r0, rWORD4_SHIFT
 L(duLoop3):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ldu	rWORD7, 32(rSTR1)
-	ldu	rWORD8, 32(rSTR2)
-#endif
+	LD	rWORD7, rOFF32, rSTR1
+	LD	rWORD8, rOFF32, rSTR2
+	addi	rSTR1, rSTR1, 32
+	addi	rSTR2, rSTR2, 32
 	cmpld	cr7, rWORD1, rWORD2
 	bne	cr1, L(duLcr1)
 	srd	r12, rWORD8, rSHR
@@ -1234,10 +929,6 @@ L(duLoop3):
 	bdnz	L(duLoop)
 
 L(duL4):
-#if 0
-/* Huh?  We've already branched on cr1!  */
-	bne	cr1, L(duLcr1)
-#endif
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr6, L(duLcr6)
 	cmpld	cr6, rWORD5, rWORD6
@@ -1264,99 +955,102 @@ L(du14):
 	beq	L(duZeroReturn)
 	li	r0, 0
 	ble	cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD2, rOFF8, rSTR2
 	srd	r0, rWORD2, rSHR
 	.align	4
 L(dutrim):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-#else
-	ld	rWORD1, 8(rSTR1)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
 	ld	rWORD8, -8(r1)
 	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
 	or	rWORD2, r0, rWORD8_SHIFT
-	ld	rWORD7, -16(r1)
-	ld	rSHL, -24(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	ld	rSHL, rSHLSAVE(r1)
 	srd	rWORD1, rWORD1, rN
 	srd	rWORD2, rWORD2, rN
-	ld	rSHR, -32(r1)
-	ld	rWORD8_SHIFT, -40(r1)
+	ld	rSHR, rSHRSAVE(r1)
+	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
 	li	rRTN, 0
 	cmpld	cr7, rWORD1, rWORD2
-	ld	rWORD2_SHIFT, -48(r1)
-	ld	rWORD4_SHIFT, -56(r1)
+	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
 	beq	cr7, L(dureturn24)
 	li	rRTN, 1
-	ld	rWORD6_SHIFT, -64(r1)
+	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	bgtlr	cr7
 	li	rRTN, -1
 	blr
 	.align	4
 L(duLcr7):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 	li	rRTN, 1
 	bgt	cr7, L(dureturn29)
-	ld	rSHL, -24(r1)
-	ld	rSHR, -32(r1)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr1):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 	li	rRTN, 1
 	bgt	cr1, L(dureturn29)
-	ld	rSHL, -24(r1)
-	ld	rSHR, -32(r1)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr6):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 	li	rRTN, 1
 	bgt	cr6, L(dureturn29)
-	ld	rSHL, -24(r1)
-	ld	rSHR, -32(r1)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr5):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 	li	rRTN, 1
 	bgt	cr5, L(dureturn29)
-	ld	rSHL, -24(r1)
-	ld	rSHR, -32(r1)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
+
 	.align	3
 L(duZeroReturn):
 	li	rRTN, 0
 	.align	4
 L(dureturn):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 L(dureturn29):
-	ld	rSHL, -24(r1)
-	ld	rSHR, -32(r1)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
 L(dureturn27):
-	ld	rWORD8_SHIFT, -40(r1)
-L(dureturn26):
-	ld	rWORD2_SHIFT, -48(r1)
-L(dureturn25):
-	ld	rWORD4_SHIFT, -56(r1)
+	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
 L(dureturn24):
-	ld	rWORD6_SHIFT, -64(r1)
+	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	blr
+
 L(duzeroLength):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 0
 	blr
 
diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S
index 6b8999dc1f..14df042785 100644
--- a/sysdeps/powerpc/powerpc64/power7/memset.S
+++ b/sysdeps/powerpc/powerpc64/power7/memset.S
@@ -383,7 +383,6 @@ L(small):
 END_GEN_TB (memset,TB_TOCLESS)
 libc_hidden_builtin_def (memset)
 
-#ifndef NO_BZERO_IMPL
 /* Copied from bzero.S to prevent the linker from inserting a stub
    between bzero and memset.  */
 ENTRY (__bzero)
@@ -391,7 +390,7 @@ ENTRY (__bzero)
 	mr	r5,r4
 	li	r4,0
 	b	L(_memset)
-END_GEN_TB (__bzero,TB_TOCLESS)
-
+END (__bzero)
+#ifndef __bzero
 weak_alias (__bzero, bzero)
 #endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S
index f16a9d8a88..ade2811a6e 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/strcmp.S
@@ -25,122 +25,96 @@
 
 /* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])  */
 
+	.machine	power7
 EALIGN (strcmp, 4, 0)
 	CALL_MCOUNT 2
 
 	or r9, r3, r4
 	rldicl. r10, r9, 0, 61	/* are s1 and s2 8 byte aligned..?  */
 	bne cr0, L(process_unaligned_bytes)
+	li	r5, 0
 
+	.align 4
 /* process input parameters on double word aligned boundary  */
-	ld r9, 0(r4)		/* load s2 at offset=0  */
-	li r10, 0		/* load mask=0  */
-	cmpb r10, r9, r10	/* compare bytes at s2 with mask  */
-	cmpdi cr7, r10, 0	/* is NULL found ..? is end of string HIT  */
-	bne cr7, L(process_unaligned_bytes)	/* process byte by byte  */
-
-	ld r10, 0(r3)		/* load s1 at offset=0  */
-	li r8, 0		/* load mask=0  */
-	cmpb r8, r10, r8	/* compare bytes at s1 with mask  */
-	cmpdi cr7, r8, 0	/* is NULL found ..? is end of string HIT  */
-	bne cr7, L(process_unaligned_bytes)	/* process byte by byte  */
-
-/*s1 and s2 does not contain NULL now , so compare all 8 bytes in a GO  */
-	cmpb r9, r10, r9	/* compare s1 and s2  */
-	cmpdi cr7, r9, -1	/* compare result with 0xFFFFFFFFFFFFFFFF  */
-	bne cr7, L(process_unaligned_bytes)	/* s1,s2 mismatch found  */
-
-	addi r5, r3, 8		/* save next offset of s2  */
-	addi r11, r4, 8		/* save next offset of s1  */
-	ld r8, 8(r4)		/* load s2 at offset=8  */
-	li r9, 0		/* load mask=0  */
-	cmpb r9, r8, r9		/* compare bytes at s2 with mask  */
-	cmpdi cr7, r9, 0	/* NULL found ..?  */
-	bne cr7, L(processBytes)/* update input and process bytes one by one  */
-
-	mr r9, r4		/* save s2  */
-	li r10, 0		/* load mask=0  */
-
-	ld r7, 8(r3)		/* load s1 at offset=8  */
-	cmpb r6, r7, r10	/* compare bytes at s1 with mask  */
-	cmpdi cr7, r6, 0	/* is NULL found  */
-	bne cr7, L(processBytes)/* mismatch, so process one by one  */
-
 L(unrollDword):
-	cmpb r8, r7, r8		/* compare s1 and s2  */
-	cmpdi cr7, r8, -1	/* compare result with 0xFFFFFFFFFFFFFFFF  */
-	bne cr7, L(processBytes)/* mismatch with s1 and s2  */
-
-	addi r5, r3, 16		/* save offset=16 of s1  */
-	addi r4, r9, 16		/* save offset=16 of s2  */
-	ld r8, 16(r9)		/* load s2 at offset=16  */
-	cmpb r7, r8, r10	/* compare bytes at s2 with mask  */
-	cmpdi cr7, r7, 0	/* NULL found  ..?  */
-	bne cr7, L(update2processBytes)
-
-	ld r7, 16(r3)		/* load s1 at offset=16  */
-	cmpb r6, r7, r10	/* check s1 for end of string  */
-	cmpdi cr7, r6, 0	/* end of s1 ?,then handle byte by byte  */
-	bne 7,L(update2processBytes)
-
-	cmpb r8, r7, r8		/* compare s1 and s2 double words  */
-	cmpdi cr7, r8, -1	/* compare results with 0xFFFFFFFFFFFFFFFF  */
-	bne cr7,L(update2processBytes)
-
-	addi r5, r3, 24		/* update s1 to offset=24  */
-	addi r4, r9, 24		/* update s2 to offset=24  */
-
-	ld r8, 24(r9)		/* load s2  */
-	cmpb r7, r8, r10	/* compare s2 for NULL  */
-	cmpdi cr7, r7, 0	/* verify if s2 is ending now  */
-	bne cr7,L(update2processBytes)
-
-	ld r7, 24(r3)		/* load s1 at offset=24  */
-	cmpb r6, r7, r10	/* verify for NULL  */
-	cmpdi cr7, r6, 0	/* is NULL found  */
-	bne cr7, L(update2processBytes)
-
-	cmpb r8, r7, r8		/* compare s1 and s2  */
-	cmpdi cr7, r8, -1	/* are s1 and s2 same ..?  */
-	bne cr7, L(update2processBytes)
-
-	addi r7, r9, 32		/* update s2 to next double word  */
-	addi r3, r3, 32		/* update s1 to next double word  */
-
-	ld r8, 32(r9)		/* load s2  */
-	mr r4, r7		/* save s2  */
-	cmpb r6, r8, r10	/* compare s2 with NULL  */
-	cmpdi cr7, r6, 0	/* end of s2 ..? */
-	bne cr7, L(process_unaligned_bytes)
-
-	ld r6, 0(r3)		/* load and compare s1 for NULL  */
-	cmpb r5, r6, r10
-	cmpdi cr7, r5, 0
-	bne cr7, L(process_unaligned_bytes)
-
-	cmpb r8, r6, r8		/* compare s1 and s2  */
-	cmpdi cr7, r8, -1
-	bne cr7, L(process_unaligned_bytes)
-
-	addi r5, r3, 8		/* increment s1 and d2 here  */
-	addi r11, r9, 40
-
-	ld r8, 40(r9)		/* process s2 now  */
-	cmpb r9, r8, r10
-	cmpdi cr7, r9, 0
-	bne cr7, L(processBytes)
-
-	mr r9, r7
-	ld r7, 8(r3)		/* process s1 now  */
-	cmpb r6, r7, r10
-	cmpdi cr7, r6, 0
-	beq cr7, L(unrollDword)	/* unroll to compare s1 and s2  */
-
-L(processBytes):
-	mr r4, r11		/* update input params  */
-	mr r3, r5
-
-	.p2align 4
+	ld	r8,0(r3)
+	ld	r10,0(r4)
+	cmpb	r7,r8,r5
+	cmpdi	cr7,r7,0
+	mr	r9,r7
+	bne 	cr7,L(null_found)
+	cmpld	cr7,r8,r10
+	bne	cr7,L(different)
+
+	ld	r8,8(r3)
+	ld	r10,8(r4)
+	cmpb	r7,r8,r5
+	cmpdi	cr7,r7,0
+	mr	r9,r7
+	bne 	cr7,L(null_found)
+	cmpld	cr7,r8,r10
+	bne	cr7,L(different)
+
+	ld	r8,16(r3)
+	ld	r10,16(r4)
+	cmpb	r7,r8,r5
+	cmpdi	cr7,r7,0
+	mr	r9,r7
+	bne 	cr7,L(null_found)
+	cmpld	cr7,r8,r10
+	bne	cr7,L(different)
+
+	ld	r8,24(r3)
+	ld	r10,24(r4)
+	cmpb	r7,r8,r5
+	cmpdi	cr7,r7,0
+	mr	r9,r7
+	bne 	cr7,L(null_found)
+	cmpld	cr7,r8,r10
+	bne	cr7,L(different)
+
+	addi r3, r3, 32
+	addi r4, r4, 32
+	beq cr7, L(unrollDword)
+
+	.align 4
+L(null_found):
+#ifdef __LITTLE_ENDIAN__
+	neg	r7,r9
+	and	r9,r9,r7
+	li	r7,-1
+	cntlzd	r9,r9
+	subfic	r9,r9,71
+	sld	r9,r7,r9
+#else
+	cntlzd	r9,r9
+	li	r7,-1
+	addi	r9,r9,8
+	srd	r9,r7,r9
+#endif
+	or	r8,r8,r9
+	or	r10,r10,r9
+
+L(different):
+	cmpb	r9,r8,r10
+#ifdef __LITTLE_ENDIAN__
+	addi	r7,r9,1
+	andc	r9,r7,r9
+	cntlzd	r9,r9
+	subfic	r9,r9,63
+#else
+	not	r9,r9
+	cntlzd	r9,r9
+	subfic	r9,r9,56
+#endif
+	srd	r3,r8,r9
+	srd	r10,r10,r9
+	rldicl	r10,r10,0,56
+	rldicl	r3,r3,0,56
+	subf	r3,r10,r3
+	blr
+
+	.align 4
 L(process_unaligned_bytes):
 	lbz r9, 0(r3)		/* load byte from s1  */
 	lbz r10, 0(r4)		/* load byte from s2  */
@@ -172,24 +146,19 @@ L(process_unaligned_bytes):
 	addi r4, r4, 4		/* increment s2 by unroll factor  */
 	beq cr6, L(process_unaligned_bytes)	/* unroll byte processing  */
 
-	.p2align 4
+	.align 4
 L(ComputeDiff):
 	extsw r9, r9
 	subf r10, r10, r9	/* compute s1 - s2  */
 	extsw r3, r10
 	blr			/* return  */
 
-	.p2align 4
+	.align 4
 L(diffOfNULL):
 	li r9, 0
 	subf r10, r10, r9	/* compute s1 - s2  */
 	extsw r3, r10		/* sign extend result  */
 	blr			/* return  */
 
-	.p2align 4
-L(update2processBytes):
-	mr r3, r5		/* update and proceed  */
-	b L(process_unaligned_bytes)
-
 END (strcmp)
 libc_hidden_builtin_def (strcmp)
diff --git a/sysdeps/powerpc/powerpc64/power7/strcpy.S b/sysdeps/powerpc/powerpc64/power7/strcpy.S
index ce71982eaf..115f98a304 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/strcpy.S
@@ -31,8 +31,6 @@
 
    if (((((uintptr_t)dst & 0x7UL) == 0) && ((uintptr_t)src & 0x7UL) == 0))
      goto aligned_doubleword_copy;
-   if (((((uintptr_t)dst & 0x3UL) == 0) && ((uintptr_t)src & 0x3UL) == 0))
-     goto aligned_word_copy;
    if (((uintptr_t)dst & 0x7UL) == ((uintptr_t)src & 0x7UL))
      goto same_alignment;
    goto unaligned;
@@ -70,9 +68,18 @@ EALIGN (FUNC_NAME, 4, 0)
 #endif
 	or	rTMP, rSRC, rRTN
 	clrldi.	rTMP, rTMP, 61
-	bne	L(check_word_alignment)
+	bne	L(check_alignment)
 	b	L(aligned_doubleword_copy)
 
+	.align 4
+L(check_alignment):
+	rldicl	rRTNAL, rRTN, 0, 61
+	rldicl	rSRCAL, rSRC, 0, 61
+	cmpld	cr7, rSRCAL, rRTNAL
+	beq	cr7, L(same_alignment)
+	b	L(unaligned)
+
+	.align 4
 L(same_alignment):
 /* Src and dst with same alignment: align both to doubleword.  */
 	mr	rALCNT, rRTN
@@ -180,93 +187,249 @@ L(g1):
 #endif
 	blr
 
-L(check_word_alignment):
-	clrldi. rTMP, rTMP, 62
-	beq	L(aligned_word_copy)
-	rldicl	rRTNAL, rRTN, 0, 61
-	rldicl	rSRCAL, rSRC, 0, 61
-	cmpld	cr7, rSRCAL, rRTNAL
-	beq	cr7, L(same_alignment)
-	b	L(unaligned)
-
-/* For word aligned memory, operate using word load and stores.  */
 	.align	4
-L(aligned_word_copy):
-	li	rMASK, 0
-	addi	rRTN, rRTN, -4
-	lwz	rWORD, 0(rSRC)
-	b	L(g5)
+L(unaligned):
+	cmpdi	rSRCAL, 0		/* Check src alignment */
+	beq	L(srcaligndstunalign)
+	/* src is unaligned */
+	rlwinm	r10, rSRC, 3,26,28	/* Calculate padding.  */
+	clrrdi	rSRC, rSRC, 3		/* Align the addr to dw boundary */
+	ld	rWORD, 0(rSRC)		/* Load doubleword from memory.  */
+	li	rTMP, 0
+	/* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+	srd	rALT, rWORD, r10
+#else
+	sld	rALT, rWORD, r10
+#endif
+	cmpb	rTMP, rALT, rTMP	/* Compare each byte against null */
+	/* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+	sld	rTMP, rTMP, r10
+#else
+	srd	rTMP, rTMP, r10
+#endif
+	cmpdi	rTMP, 0
+	bne	L(bytebybyte)		/* if it has null, copy byte by byte */
+	subfic	r8, r9, 8
+	rlwinm	r5, rRTN, 3,26,28	/* Calculate padding in bits.  */
+	rldicl	r9, rRTN, 0, 61		/* Calculate padding in bytes. */
+	addi	rRTN, rRTN, -1
 
-	.align	4
-L(g3):	lwzu	rALT, 4(rSRC)
-	stwu	rWORD, 4(rRTN)
-	cmpb	rTMP, rALT, rMASK
-	cmpwi	rTMP, 0
-	bne	L(g4)
-	lwzu	rWORD, 4(rSRC)
-	stwu	rALT, 4(rRTN)
-L(g5):	cmpb	rTMP, rWORD, rMASK
-	cmpwi	rTMP, 0		/* If rTMP is 0, no null in word.  */
-	beq	L(g3)
-
-	mr      rALT, rWORD
-/* We've hit the end of the string.  Do the rest byte-by-byte.  */
-L(g4):
+	cmpdi	r5, 0			/* check dest alignment */
+	beq	L(srcunaligndstalign)
+
+	/* both src and dst unaligned */
 #ifdef __LITTLE_ENDIAN__
-	rlwinm.	rTMP, rALT, 0, 24, 31
-	stbu	rALT, 4(rRTN)
-	beqlr-
-	rlwinm.	rTMP, rALT, 24, 24, 31
-	stbu	rTMP, 1(rRTN)
-	beqlr-
-	rlwinm.	rTMP, rALT, 16, 24, 31
-	stbu	rTMP, 1(rRTN)
-	beqlr-
-	rlwinm	rTMP, rALT, 8, 24, 31
-	stbu	rTMP, 1(rRTN)
+	sld	rWORD, rALT, r10
+	mr 	r11, r10
+	addi	r11, r11, -8		/* Adjust byte pointer on loaded dw */
 #else
-	rlwinm. rTMP, rALT, 8, 24, 31
-	stbu    rTMP, 4(rRTN)
-	beqlr
-	rlwinm. rTMP, rALT, 16, 24, 31
-	stbu    rTMP, 1(rRTN)
-	beqlr
-	rlwinm. rTMP, rALT, 24, 24, 31
-	stbu    rTMP, 1(rRTN)
-	beqlr
-	stbu    rALT, 1(rRTN)
+	srd	rWORD, rALT, r10
+	subfic	r11, r10, 64
 #endif
-	blr
+	/* dst alignment is greater than src alignment? */
+	cmpd	cr7, r5, r10
+	blt	cr7, L(dst_align_small)
+	/* src alignment is less than dst */
 
-/* Oh well.  In this case, we just do a byte-by-byte copy.  */
-	.align	4
-L(unaligned):
-	lbz	rWORD, 0(rSRC)
-	addi	rRTN, rRTN, -1
-	cmpdi	rWORD, 0
-	beq	L(u2)
-
-	.align 	5
-L(u0):	lbzu	rALT, 1(rSRC)
-	stbu	rWORD, 1(rRTN)
-	cmpdi	rALT, 0
-	beq	L(u1)
-	lbzu	rWORD, 1(rSRC)
+	/* Calculate the dst alignment difference */
+	subfic	rALT, r9, 8
+	mtctr	rALT
+
+	/* Write till dst is aligned */
+	cmpdi	rTMP, rALT, 4
+	blt	L(storebyte1)		/* less than 4, store byte by byte */
+	beq	L(equal1)		/* if it's 4, store word */
+	addi	rTMP, rALT, -4		/* greater than 4, so stb and stw */
+	mtctr	rTMP
+L(storebyte1):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8		/* Adjust byte pointer on loaded dw */
+#else
+	addi	r11, r11, -8
+#endif
+	srd	rALT, rWORD, r11
+	stbu	rALT, 1(rRTN)
+	bdnz	L(storebyte1)
+
+	subfic	rALT, r9, 8		/* Check the remaining bytes */
+	cmpdi	rTMP, rALT, 4
+	blt	L(proceed)
+
+	.align 4
+L(equal1):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8		/* Adjust byte pointer on loaded dw */
+	srd	rALT, rWORD, r11
+#else
+	subfic	r11, r11, 64
+	sld	rALT, rWORD, r11
+	srdi	rALT, rALT, 32
+#endif
+	stw	rALT, 1(rRTN)
+	addi	rRTN, rRTN, 4
+
+L(proceed):
+	mr	rALT, rWORD
+	/* calculate the Left over bytes to be written */
+	subfic	r11, r10, 64
+	subfic	r5, r5, 64
+	subf	r5, r5, r11		/* remaining bytes on second dw */
+	subfic	r10, r5, 64		/* remaining bytes on first dw */
+	subfic	r9, r9, 8
+	subf	r8, r9, r8		/* recalculate padding */
+L(srcunaligndstalign):
+	addi	rRTN, rRTN, 1
+	subfic	r5, r10, 64		/* remaining bytes on second dw */
+	addi	rSRC, rSRC, 8
+	li	rTMP,0
+	b	L(storedouble)
+
+	.align 4
+L(dst_align_small):
+	mtctr	r8
+	/* Write till src is aligned */
+L(storebyte2):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8		/* Adjust byte pointer on dw */
+#else
+	addi	r11, r11, -8
+#endif
+	srd	rALT, rWORD, r11
 	stbu	rALT, 1(rRTN)
-	cmpdi	rWORD, 0
-	beq	L(u2)
-	lbzu	rALT, 1(rSRC)
-	stbu	rWORD, 1(rRTN)
-	cmpdi	rALT, 0
-	beq	L(u1)
-	lbzu	rWORD, 1(rSRC)
+	bdnz	L(storebyte2)
+
+	addi	rSRC, rSRC, 8		/* Increment src pointer */
+	addi	rRTN, rRTN, 1		/* Increment dst pointer */
+	rldicl	r8, rRTN, 0, 61		/* Recalculate padding */
+
+	/* src is aligned */
+L(srcaligndstunalign):
+	ld	rWORD, 0(rSRC)
+	mr	rALT, rWORD
+	li	rTMP, 0			/* Check null */
+	cmpb	rTMP, rWORD, rTMP
+	cmpdi	rTMP, 0
+	bne	L(bytebybyte)		/* Do byte by byte if there is NULL */
+	rlwinm	r5, rRTN, 3,26,28	/* Calculate padding */
+	addi	rRTN, rRTN, -1
+	subfic	r10, r8, 8
+	/* write byte by byte till aligned */
+#ifdef __LITTLE_ENDIAN__
+	li	r11, -8
+#else
+	li	r11, 64
+#endif
+	mtctr	r10
+	cmpdi	rTMP, r10, 4
+	blt	L(storebyte)
+	beq	L(equal)
+	addi	rTMP, r10, -4
+	mtctr	rTMP
+L(storebyte):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8		/* Adjust byte pointer on dw */
+#else
+	addi	r11, r11, -8
+#endif
+	srd	rALT, rWORD, r11
 	stbu	rALT, 1(rRTN)
-	cmpdi	rWORD, 0
-	bne	L(u0)
-L(u2):	stbu	rWORD, 1(rRTN)
-	blr
-L(u1):	stbu	rALT, 1(rRTN)
-	blr
+	bdnz	L(storebyte)
+
+	cmpdi	rTMP, r10, 4
+	blt	L(align)
+
+	.align 4
+L(equal):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8
+	srd	rALT, rWORD, r11
+#else
+	subfic	r11, r11, 64
+	sld	rALT, rWORD, r11
+	srdi	rALT, rALT, 32
+#endif
+	stw	rALT, 1(rRTN)
+	addi	rRTN, rRTN, 4
+L(align):
+	addi	rRTN, rRTN, 1
+	addi	rSRC, rSRC, 8		/* Increment src pointer */
+	subfic	r10, r5, 64
+	li	rTMP, 0
+	/* dst addr aligned to 8 */
+L(storedouble):
+	ld	rALT, 0(rSRC)		/* load next dw */
+	cmpb	rTMP, rALT, rTMP
+	cmpdi	rTMP, 0			/* check for null on each new dw */
+	bne	L(null)
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, rWORD, r10		/* bytes from first dw */
+	sld	r11, rALT, r5		/* bytes from second dw */
+#else
+	sld	r9, rWORD, r10
+	srd	r11, rALT, r5
+#endif
+	or	r11, r9, r11		/* make as a single dw */
+	std	r11, 0(rRTN)		/* store as std on aligned addr */
+	mr	rWORD, rALT		/* still few bytes left to be written */
+	addi	rRTN, rRTN, 8		/* increment dst addr */
+	addi	rSRC, rSRC, 8		/* increment src addr */
+	b	L(storedouble)		/* Loop till NULL */
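+
+	/* In C terms each iteration performs (an illustration only; the
+	   shift directions swap between endiannesses, as in the #ifdef
+	   above):
+	     out = (w0 << used_bits) | (w1 >> (64 - used_bits));
+	   so each aligned std stores bytes spanning two source dwords.  */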
+
+	.align 4
+
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(null):
+	addi	rRTN, rRTN, -1
+	mr	r10, r5
+	mtctr	r8
+#ifdef __LITTLE_ENDIAN__
+	subfic	r10, r10, 64
+	addi	r10, r10, -8
+#endif
+	cmpdi	rTMP, r8, 4
+	blt	L(loop)
+
+	/* We can still use stw if leftover >= 4.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	r10, r10, 8
+	srd	r11, rWORD, r10
+#else
+	subfic	r10, r10, 64
+	sld	r11, rWORD, r10
+	srdi	r11, r11, 32
+#endif
+	stw	r11, 1(rRTN)
+	addi	rRTN, rRTN, 4
+
+	beq	L(bytebybyte1)
+	addi	r10, r10, 32
+#ifdef __LITTLE_ENDIAN__
+	addi	r10, r10, -8
+#else
+	subfic	r10, r10, 64
+#endif
+	addi	rTMP, r8, -4
+	mtctr	rTMP
+	/* remaining byte by byte part of first dw */
+L(loop):
+#ifdef __LITTLE_ENDIAN__
+	addi	r10, r10, 8
+#else
+	addi	r10, r10, -8
+#endif
+	srd	rTMP, rWORD, r10
+	stbu	rTMP, 1(rRTN)
+	bdnz	L(loop)
+
+L(bytebybyte1):
+	addi	rRTN, rRTN, 1
+	/* remaining byte by byte part of second dw */
+L(bytebybyte):
+	addi	rRTN, rRTN, -8
+	b	L(g1)
+
 END (FUNC_NAME)
 
 #ifndef USE_AS_STPCPY
diff --git a/sysdeps/powerpc/powerpc64/power7/strncat.S b/sysdeps/powerpc/powerpc64/power7/strncat.S
deleted file mode 100644
index f5ea52d3d4..0000000000
--- a/sysdeps/powerpc/powerpc64/power7/strncat.S
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Optimized strncat implementation for PowerPC64/POWER7.
-
-   Copyright (C) 2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* The algorithm is as follows for aligned memory access :
-
-   if address of s2 is divisible by 0x7UL,
-       perform aligned doubleword catenation
-   else
-       perform unaligned catenation
-
-   The aligned comparison are made using cmpb instructions.  */
-
-/* char* [r3] strncat (const char *s1 [r3],
-                       const char *s2 [r4],
-                       size_t size [r5])  */
-
-#include <sysdep.h>
-
-#ifndef STRNCAT
-# undef strncat
-# define STRNCAT  strncat
-#endif
-
-#ifndef STRLEN
-/* For builds with no IFUNC support, local calls should be made to internal
-   GLIBC symbol (created by libc_hidden_builtin_def).  */
-# ifdef SHARED
-#  define STRLEN   __GI_strlen
-# else
-#  define STRLEN   strlen
-# endif
-#endif
-
-#define	FRAMESIZE	(FRAME_MIN_SIZE+32)
-
-	.machine  power7
-EALIGN(STRNCAT, 4, 0)
-	CALL_MCOUNT 3
-
-	mflr r0				/* Load link register LR to r0.  */
-
-/* We shall use r29, r30 and r31 non volatile register for retention.
-   Save all the callee registers in the GPR save area.  */
-	std r29, -24(r1)		/* Save callers register r29.  */
-	std r30, -16(r1)		/* Save callers register r30.  */
-	std r31, -8(r1)			/* Save callers register r31.  */
-
-	std r0, 16(r1)			/* Store the link register.  */
-	stdu r1, -FRAMESIZE(r1)		/* Create the stack frame.  */
-
-/* Improve performance with CPU pre-fetch.  */
-	dcbt 0, r3			/* Pre-fetch str to avoid cache
-					   miss.  */
-	dcbt 0, r4			/* Pre-fetch accept to avoid cache
-					   miss.  */
-
-	mr. r29, r5			/* Save "n" in r29.  */
-	mr r30, r3			/* Save "s1" in r30 from r3.  */
-	beq cr0,L(done)
-
-	mr r31, r4			/* Save "s2" in r31 from r4.  */
-	bl STRLEN			/* Call optimized strlen on s1; goto
-					   end of s1.  */
-	nop
-	cmpldi cr7, r29, 7		/* If s2 is <=7 process
-					    byte-by-byte.  */
-	add r3, r30, r3			/* Grab the last character of s1.  */
-	bgt cr7,L(alignment)		/* Process by aligned strings.  */
-
-	cmpldi cr7, r29, 3		/* If n is >= 4, we can
-					   byte-unroll.  */
-	addi r9, r3, -1			/* Make "s1" point before next
-					   character, increment when read.  */
-	bgt cr7, L(bytes_unroll)	/* Process each byte.  */
-
-L(byte_by_byte):
-	lbz r10, 0(r31)
-	addi r8, r9, 1
-	cmpdi cr7, r10, 0		/* Check for NULL in "s2".  */
-	stb r10, 1(r9)
-	beq cr7, L(done)
-	add r9, r9, r29
-	subf r9, r8, r9
-	addi r9, r9, 1
-	mtctr r9
-	b L(branch2)
-	.p2align 4
-L(branch1):
-	lbzu r10, 1(r31)
-	cmpdi cr7, r10, 0
-	stbu r10, 1(r8)
-	beq cr7,L(done)
-L(branch2):
-	mr r9, r8
-	bdnz L(branch1)
-	beq cr7,L(done)
-L(nullTerminate):
-	li r10, 0			/* Load NULL for termination.  */
-	stb r10, 1(r9)			/* Append or terminate s1 with
-					   NULL.  */
-	.p2align 4			/* A small section here.  */
-L(done):				/* We return now.   */
-	addi r1, r1, FRAMESIZE		/* Restore stack pointer.  */
-	mr r3, r30			/* Set the return value length of
-					   string.  */
-	ld r0, 16(r1)			/* Read the saved link register.  */
-	ld r29, -24(r1)			/* Restore save register r29.  */
-	ld r30, -16(r1)			/* Restore save register r30.  */
-	ld r31, -8(r1)			/* Restore save register r31.  */
-	mtlr r0				/* Restore link register.  */
-	blr				/* Branch to link register.  */
-
-	.p2align 4
-L(alignment):
-	rldicl. r9, r31, 0, 61		/* Check if s2 is 8byte aligned  */
-	beq cr0,L(dwordAligned)
-
-	.p2align 4
-/* Unaligned bytes in string, so process byte by byte.
-   POWER7 has performance gains over loop unroll.  */
-L(bytes_unroll):
-	addi r9, r3, -1
-	srdi r10, r29, 2
-	mtctr r10
-	b L(L10)
-	.p2align 4
-L(L44):
-	lbz r10, 1(r31)			/* Load byte.  */
-	cmpdi cr7, r10, 0		/* Compare ; if byte not zero,
-					   continue.  */
-	stb r10, 2(r9)			/* Store byte  */
-	beq cr7, L(done)
-	addi r31, r31, 4
-
-	lbz r10, -2(r31)		/* Perform loop unroll here on byte
-					   load and store.  */
-	cmpdi cr7, r10, 0
-	stb r10, 3(r9)
-	beq cr7, L(done)
-
-	lbz r10, -1(r31)		/* Loop unroll here.  */
-	cmpdi cr7, r10, 0
-	stbu r10, 4(r9)
-	beq cr7, L(done)
-
-	bdz L(leftNbytes)
-
-L(L10):
-	lbz r10, 0(r31)			/* Loop unroll here.  */
-	cmpdi cr7, r10, 0
-	stb r10, 1(r9)
-	bne cr7,L(L44)
-	b L(done)
-	.p2align 4
-/* If s2 is double word aligned, we load and store double word.  */
-L(dwordAligned):
-/* read, write 8 bytes at a time  */
-	srdi r8, r29, 3			/* Compute count for CTR to loop;
-					   count = n/8.  */
-	li r7, 0			/* Load r7 with NULL.  */
-	li r10, 0			/* Load r10 with MASK '0'.  */
-
-	mtctr r8			/* Move count to CTR.  */
-L(loop8):
-	ld r9, 0(r31)			/* Read double word from s2.  */
-	cmpb r6, r9, r10		/* Compare bytes in s2 we read
-					   just now.  */
-	cmpdi r6, 0			/* If cmpb returned NULL,
-					   we continue.  */
-	bne+ L(a8)
-	std r9, 0(r3)			/* Append double word from s2
-					   with s1.  */
-	addi r3, r3, 8			/* Increment s1.  */
-	addi r31, r31, 8		/* Increment s2.  */
-	subi r29, r29, 8		/* Decrement count by 8.  */
-	bdnz L(loop8)			/* Continue until "count" is
-					   non zero.  */
-
-L(a8):
-	cmpdi r29, 0			/* If "n" is already zero, we skip. */
-	beq+ L(align8align)
-
-	mtctr r29			/* Process left over bytes in "n".  */
-L(unaligned0):
-	lbz r9, 0(r31)			/* Read a byte from s2.  */
-	cmpw r9, r7			/* If byte is NULL, we stop here . */
-	beq+ L(align8align)		/* Skip processing further if NULL.  */
-	stb  r9, 0(r3)			/* If not NULL, store byte into s1.  */
-	addi r3, r3, 1			/* Increment s1 by 1.  */
-	addi r31, r31, 1		/* Increment s2 by 1.  */
-	bdnz L(unaligned0)		/* Decrement counter "n" and loop
-					   until non zero.  */
-L(align8align):
-	stb r7, 0(r3)			/* Terminate s1 with NULL.  */
-
-	addi r1, r1, FRAMESIZE		/* Restore stack pointer.  */
-	mr r3, r30			/* Set the return value, length of
-					   string.  */
-	ld r0, 16(r1)			/* Read the saved link register.  */
-	ld r29, -24(r1)			/* Restore save register r29.  */
-	ld r30, -16(r1)			/* Restore save register r30.  */
-	ld r31, -8(r1)			/* Restore save register r31.  */
-	mtlr r0				/* Restore link register.  */
-	blr				/* Branch to link register  */
-
-	.p2align 4
-L(leftNbytes):
-	rldicl. r29, r29, 0, 62		/* Check if n>0 and n < 4 bytes.  */
-	bne cr0,L(byte_by_byte)		/* Process bytes one by one. */
-	b L(nullTerminate)		/* Now, finish catenation with
-					   NULL termination.  */
-END(STRNCAT)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
index 2b27e7b923..3e981265ab 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
@@ -17,14 +17,9 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include <endian.h>
 #include <math_ldbl_opt.h>
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1  .byte 0x66,0x00,0x23,0x7c     /* mfvsrd  r3,vs1  */
-#else
-#define MFVSRD_R3_V1  .byte 0x7c,0x23,0x00,0x66     /* mfvsrd  r3,vs1  */
-#endif
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
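+/* Note: a single .long works for both endiannesses, since the assembler
+   emits the word in the target byte order, which is also the order in
+   which instructions are fetched; the per-endian .byte sequences are
+   thus no longer needed.  */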
 
 /* int [r3] __finite ([fp1] x)  */
 
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
index d09b7fcef9..125de3943d 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
@@ -17,14 +17,9 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include <endian.h>
 #include <math_ldbl_opt.h>
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1  .byte 0x66,0x00,0x23,0x7c     /* mfvsrd  r3,vs1  */
-#else
-#define MFVSRD_R3_V1  .byte 0x7c,0x23,0x00,0x66     /* mfvsrd  r3,vs1  */
-#endif
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
 
 /* int [r3] __isinf([fp1] x)  */
 
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
index cf119e5c98..2c7b2d1d9a 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
@@ -17,14 +17,9 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include <endian.h>
 #include <math_ldbl_opt.h>
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1  .byte 0x66,0x00,0x23,0x7c     /* mfvsrd  r3,vs1  */
-#else
-#define MFVSRD_R3_V1  .byte 0x7c,0x23,0x00,0x66     /* mfvsrd  r3,vs1  */
-#endif
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
 
 /* int [r3] __isnan([f1] x)  */
 
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
index 9a55d93875..ce48d4e52c 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
@@ -17,14 +17,9 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include <endian.h>
 #include <math_ldbl_opt.h>
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1  .byte 0x66,0x00,0x23,0x7c     /* mfvsrd  r3,vs1  */
-#else
-#define MFVSRD_R3_V1  .byte 0x7c,0x23,0x00,0x66     /* mfvsrd  r3,vs1  */
-#endif
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
 
 /* long long int[r3] __llrint (double x[fp1])  */
 ENTRY (__llrint)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
index f10c06a36c..17cf30eaf1 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
@@ -20,11 +20,7 @@
 #include <endian.h>
 #include <math_ldbl_opt.h>
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1  .byte 0x66,0x00,0x23,0x7c     /* mfvsrd  r3,vs1  */
-#else
-#define MFVSRD_R3_V1  .byte 0x7c,0x23,0x00,0x66     /* mfvsrd  r3,vs1  */
-#endif
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
 
 /* long long [r3] llround (float x [fp1])  */
 
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
new file mode 100644
index 0000000000..d7324dc54a
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -0,0 +1,451 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define MTVSRD_V1_R4  .long 0x7c240166     /* mtvsrd  v1,r4  */
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
+   Returns 's'.  */
+
+	.machine power8
+EALIGN (memset, 5, 0)
+	CALL_MCOUNT 3
+
+L(_memset):
+	cmpldi	cr7,r5,31
+	neg	r0,r3
+	mr	r10,r3
+
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
+	ble	cr7,L(write_LT_32)
+
+	andi.	r11,r10,15	/* Check alignment of DST.  */
+	insrdi	r4,r4,32,0	/* Replicate word to double word.  */
+
+	beq	L(big_aligned)
+
+	mtocrf	0x01,r0
+	clrldi	r0,r0,60
+
+	/* Get DST aligned to 16 bytes.  */
+1:	bf	31,2f
+	stb	r4,0(r10)
+	addi	r10,r10,1
+
+2:	bf	30,4f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+4:	bf	29,8f
+	stw	r4,0(r10)
+	addi	r10,r10,4
+
+8:	bf      28,16f
+	std     r4,0(r10)
+	addi    r10,r10,8
+
+16:	subf	r5,r0,r5
+
+	.align	4
+L(big_aligned):
+	/* For sizes larger than 255 there are two possible paths:
+	   - if the constant is '0', zero full cache lines with dcbz
+	   - otherwise use vector instructions.  */
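+	/* In C terms this dispatch is roughly (an illustration only):
+	     if (c == 0 && n > 255)
+	       goto huge_dcbz;          zero whole cache lines
+	     if (n >= 255)
+	       goto huge_vector;        use 16-byte vector stores  */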
+	cmpldi	cr5,r5,255
+	dcbtst	0,r10
+	cmpldi	cr6,r4,0
+	crand	27,26,21
+	bt	27,L(huge_dcbz)
+	bge	cr5,L(huge_vector)
+
+
+	/* Size between 32 and 255 bytes with a constant different from 0:
+	   use doubleword store instructions to achieve best throughput.  */
+	srdi    r8,r5,5
+	clrldi  r11,r5,59
+	cmpldi  cr6,r11,0
+	cmpdi	r8,0
+	beq     L(tail_bytes)
+	mtctr   r8
+
+	/* Main aligned write loop, writes 32-bytes at a time.  */
+	.align  4
+L(big_loop):
+	std     r4,0(r10)
+	std     r4,8(r10)
+	std     r4,16(r10)
+	std     r4,24(r10)
+	addi    r10,r10,32
+	bdz     L(tail_bytes)
+
+	std     r4,0(r10)
+	std     r4,8(r10)
+	std     r4,16(r10)
+	std     r4,24(r10)
+	addi    r10,r10,32
+	bdnz    L(big_loop)
+
+	b       L(tail_bytes)
+
+	/* Write remaining 1~31 bytes.  */
+	.align  4
+L(tail_bytes):
+	beqlr   cr6
+
+	srdi    r7,r11,4
+	clrldi  r8,r11,60
+	mtocrf  0x01,r7
+
+	.align	4
+	bf	31,8f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	addi	r10,r10,16
+
+	.align	4
+8:	mtocrf	0x1,r8
+	bf	28,4f
+	std	r4,0(r10)
+	addi	r10,r10,8
+
+	.align	4
+4:	bf      29,2f
+	stw     r4,0(r10)
+	addi    r10,r10,4
+
+	.align 	4
+2:	bf      30,1f
+	sth     r4,0(r10)
+	addi    r10,r10,2
+
+	.align  4
+1:      bflr    31
+	stb     r4,0(r10)
+	blr
+
+	/* Size larger than 255 bytes with a constant different from 0:
+	   use vector instructions to achieve best throughput.  */
+L(huge_vector):
+	/* Replicate set byte to quadword in VMX register.  */
+	MTVSRD_V1_R4
+	xxpermdi 32,v0,v1,0
+	vspltb	 v2,v0,15
+
+	/* Main aligned write loop: 128 bytes at a time.  */
+	li	r6,16
+	li	r7,32
+	li	r8,48
+	mtocrf	0x02,r5
+	srdi	r12,r5,7
+	cmpdi	r12,0
+	beq	L(aligned_tail)
+	mtctr	r12
+	b	L(aligned_128loop)
+
+	.align  4
+L(aligned_128loop):
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	stvx	v2,r10,r7
+	stvx	v2,r10,r8
+	addi	r10,r10,64
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	stvx	v2,r10,r7
+	stvx	v2,r10,r8
+	addi	r10,r10,64
+	bdnz	L(aligned_128loop)
+
+	/* Write remaining 1~127 bytes.  */
+L(aligned_tail):
+	mtocrf	0x01,r5
+	bf	25,32f
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	stvx	v2,r10,r7
+	stvx	v2,r10,r8
+	addi	r10,r10,64
+
+32:	bf	26,16f
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	addi	r10,r10,32
+
+16:	bf	27,8f
+	stvx	v2,0,r10
+	addi	r10,r10,16
+
+8:	bf	28,4f
+	std     r4,0(r10)
+	addi	r10,r10,8
+
+	/* Copies 4~7 bytes.  */
+4:	bf	29,L(tail2)
+	stw     r4,0(r10)
+	bf      30,L(tail5)
+	sth     r4,4(r10)
+	bflr	31
+	stb     r4,6(r10)
+	/* Return original DST pointer.  */
+	blr
+
+	/* Special case when value is 0 and we have a long length to deal
+	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
+	   Before using dcbz though, we need to get the destination 128-byte
+	   aligned.  */
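+	/* In C terms (an illustration only):
+	     pad = (-(uintptr_t) p) & 127;       bytes up to 128B alignment
+	     n -= pad;                           pad is written with std below
+	     for (; n >= 512; n -= 512, p += 512)
+	       zero cache lines p+0, p+128, p+256, p+384 with dcbz  */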
+	.align	4
+L(huge_dcbz):
+	andi.	r11,r10,127
+	neg	r0,r10
+	beq	L(huge_dcbz_aligned)
+
+	clrldi	r0,r0,57
+	subf	r5,r0,r5
+	srdi	r0,r0,3
+	mtocrf	0x01,r0
+
+	/* Write 1~128 bytes until DST is aligned to 128 bytes.  */
+8:	bf	28,4f
+
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	std	r4,32(r10)
+	std	r4,40(r10)
+	std	r4,48(r10)
+	std	r4,56(r10)
+	addi	r10,r10,64
+
+	.align	4
+4:	bf	29,2f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	addi	r10,r10,32
+
+	.align	4
+2:	bf	30,1f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	addi	r10,r10,16
+
+	.align	4
+1:	bf	31,L(huge_dcbz_aligned)
+	std	r4,0(r10)
+	addi	r10,r10,8
+
+L(huge_dcbz_aligned):
+	/* Setup dcbz unroll offsets and count numbers.  */
+	srdi	r8,r5,9
+	clrldi	r11,r5,55
+	cmpldi	cr6,r11,0
+	li	r9,128
+	cmpdi	r8,0
+	beq     L(huge_tail)
+	li	r7,256
+	li	r6,384
+	mtctr	r8
+
+	.align	4
+L(huge_loop):
+	/* Sets 512 bytes to zero in each iteration, the loop unrolling shows
+	   a throughput boost for large sizes (2048 bytes or higher).  */
+	dcbz	0,r10
+	dcbz	r9,r10
+	dcbz	r7,r10
+	dcbz	r6,r10
+	addi	r10,r10,512
+	bdnz	L(huge_loop)
+
+	beqlr	cr6
+
+L(huge_tail):
+	srdi    r6,r11,8
+	srdi    r7,r11,4
+	clrldi  r8,r11,4
+	cmpldi  cr6,r8,0
+	mtocrf  0x01,r6
+
+	beq	cr6,L(tail)
+
+	/* We have 1~511 bytes remaining.  */
+	.align	4
+32:	bf	31,16f
+	dcbz	0,r10
+	dcbz	r9,r10
+	addi	r10,r10,256
+
+	.align	4
+16:	mtocrf  0x01,r7
+	bf	28,8f
+	dcbz	0,r10
+	addi	r10,r10,128
+
+	.align 	4
+8:	bf	29,4f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	std	r4,32(r10)
+	std	r4,40(r10)
+	std	r4,48(r10)
+	std	r4,56(r10)
+	addi	r10,r10,64
+
+	.align	4
+4:	bf	30,2f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	addi	r10,r10,32
+
+	.align	4
+2:	bf	31,L(tail)
+	std	r4,0(r10)
+	std	r4,8(r10)
+	addi	r10,r10,16
+	.align	4
+
+	/* Remaining 1~15 bytes.  */
+L(tail):
+	mtocrf  0x01,r8
+
+	.align	4
+8:	bf	28,4f
+	std	r4,0(r10)
+	addi	r10,r10,8
+
+	.align	4
+4:	bf	29,2f
+	stw	r4,0(r10)
+	addi	r10,r10,4
+
+	.align	4
+2:	bf	30,1f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+	.align	4
+1:	bflr	31
+	stb	r4,0(r10)
+	blr
+
+	/* Handle short copies of 0~31 bytes.  Best throughput is achieved
+	   by just unrolling all operations.  */
+	.align	4
+L(write_LT_32):
+	cmpldi	cr6,r5,8
+	mtocrf	0x01,r5
+	ble	cr6,L(write_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	r8,r4
+	andi.	r0,r8,3
+	cmpldi	cr1,r5,16
+	beq	L(write_LT_32_aligned)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,r0
+	subf	r5,r0,r5
+
+2:	bf	30,1f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+1:	bf	31,L(end_4bytes_alignment)
+	stb	r4,0(r10)
+	addi	r10,r10,1
+
+	.align	4
+L(end_4bytes_alignment):
+	cmpldi	cr1,r5,16
+	mtocrf	0x01,r5
+
+L(write_LT_32_aligned):
+	blt	cr1,8f
+
+	stw	r4,0(r10)
+	stw	r4,4(r10)
+	stw	r4,8(r10)
+	stw	r4,12(r10)
+	addi	r10,r10,16
+
+8:	bf	28,L(tail4)
+	stw	r4,0(r10)
+	stw	r4,4(r10)
+	addi	r10,r10,8
+
+	.align	4
+	/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	stw	r4,0(r10)
+	bf	30,L(tail5)
+	sth	r4,4(r10)
+	bflr	31
+	stb	r4,6(r10)
+	blr
+
+	.align	4
+	/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	sth	r4,0(r10)
+	bflr	31
+	stb	r4,2(r10)
+	blr
+
+	.align	4
+L(tail5):
+	bflr	31
+	stb	r4,4(r10)
+	blr
+
+	.align	4
+1: 	bflr	31
+	stb	r4,0(r10)
+	blr
+
+	/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(write_LE_8):
+	bne	cr6,L(tail4)
+
+	stw	r4,0(r10)
+	stw	r4,4(r10)
+	blr
+END_GEN_TB (memset,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+   between bzero and memset.  */
+ENTRY (__bzero)
+	CALL_MCOUNT 3
+	mr	r5,r4
+	li	r4,0
+	b	L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/stpcpy.S b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
new file mode 100644
index 0000000000..bf72065114
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpcpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPCPY
+#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
+
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/powerpc/powerpc64/power8/stpncpy.S b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
new file mode 100644
index 0000000000..76a146609f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
@@ -0,0 +1,20 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S
new file mode 100644
index 0000000000..223d891c2e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcmp.S
@@ -0,0 +1,257 @@
+/* Optimized strcmp implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Implements the function
+
+   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending on data alignment.  Although recent powerpc64 uses
+   64K as default page size, the page cross handling assumes a minimum
+   page size of 4k.  */
+
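+/* As a rough C sketch of the scheme (an illustration, not a definitive
+   description), each doubleword step behaves like:
+
+     uint64_t w1 = *(uint64_t *) s1, w2 = *(uint64_t *) s2;
+     uint64_t has_nul = cmpb (w1, 0);    0xff in each byte of w1 that is 0
+     uint64_t eq      = cmpb (w1, w2);   0xff in each byte where w1 == w2
+     if ((has_nul | ~eq) != 0)           the orc. instruction below
+       goto found_nul_or_diff;  */
+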
+EALIGN (strcmp, 4, 0)
+	li	r0,0
+
+	/* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+	   the code:
+
+	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+	   with PAGE_SIZE being 4096 and ITER_SIZE being 32.  */
+
+	rldicl	r7,r3,0,52
+	rldicl	r9,r4,0,52
+	cmpldi	cr7,r7,4096-32
+	bgt	cr7,L(pagecross_check)
+	cmpldi	cr5,r9,4096-32
+	bgt	cr5,L(pagecross_check)
+
+	/* For short strings up to 32 bytes, load both s1 and s2 using
+	   unaligned dwords and compare.  */
+	ld	r8,0(r3)
+	ld	r10,0(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,8(r3)
+	ld	r10,8(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,16(r3)
+	ld	r10,16(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,24(r3)
+	ld	r10,24(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	addi	r7,r3,32
+	addi	r4,r4,32
+
+L(align_8b):
+	/* Now that the first 32 bytes have been checked, align source1 to
+	   doubleword and adjust the source2 address.  */
+	rldicl	r9,r7,0,61	/* source1 alignment to doubleword  */
+	subf	r4,r9,r4	/* Adjust source2 address based on source1
+				   alignment.  */
+	rldicr	r7,r7,0,60	/* Align source1 to doubleword.  */
+
+	/* At this point, source1 alignment is 0 and source2 alignment is
+	   between 0 and 7.  Check if source2 alignment is 0, meaning both
+	   sources have the same alignment.  */
+	andi.	r9,r4,0x7
+	bne	cr0,L(loop_diff_align)
+
+	/* If both source1 and source2 are doubleword aligned, there is no
+	   need for page boundary cross checks.  */
+
+	ld	r8,0(r7)
+	ld	r10,0(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	.align 4
+L(loop_equal_align):
+	ld	r8,8(r7)
+	ld	r10,8(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,16(r7)
+	ld	r10,16(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ldu	r8,24(r7)
+	ldu	r10,24(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	b	L(loop_equal_align)
+
+	/* A zero byte was found in r8 (s1 dword), r9 contains the cmpb
+	   result and r10 the dword from s2.  The code isolates the bytes
+	   up to the end (including the '\0'), masking the remaining ones
+	   with 0xFF:
+
+           #if __LITTLE_ENDIAN__
+	     (__builtin_ffsl (x) - 1) = counting trailing zero bits
+	     r9 = (__builtin_ffsl (r9) - 1) + 8;
+	     r9 = -1UL << r9
+	   #else
+	     r9  = __builtin_clzl (r9) + 8;
+	     r9  = -1UL >> r9
+	   #endif
+	     r8  = r8  | r9
+	     r10 = r10 | r9  */
+
+#ifdef __LITTLE_ENDIAN__
+	nor 	r9,r9,r9
+L(different_nocmpb):
+	neg	r3,r9
+	and	r9,r9,r3
+	cntlzd	r9,r9
+	subfic	r9,r9,63
+#else
+	not	r9,r9
+L(different_nocmpb):
+	cntlzd	r9,r9
+	subfic	r9,r9,56
+#endif
+	srd	r3,r8,r9
+	srd	r10,r10,r9
+	rldicl	r10,r10,0,56
+	rldicl	r3,r3,0,56
+	subf	r3,r10,r3
+	extsw	r3,r3
+	blr
+
+	.align	4
+L(pagecross_check):
+	subfic	r9,r9,4096
+	subfic	r7,r7,4096
+	cmpld	cr7,r7,r9
+	bge	cr7,L(pagecross)
+	mr	r7,r9
+
+	/* If an unaligned 16-byte read crosses a 4K page boundary, a
+	   simple byte-by-byte comparison is used until the page alignment
+	   for s1 is reached.  */
+L(pagecross):
+	add	r7,r3,r7
+	subf	r9,r3,r7
+	mtctr	r9
+
+	.align	4
+L(pagecross_loop):
+	/* Loads a byte from s1 and s2, compare if *s1 is equal to *s2
+	   and if *s1 is '\0'.  */
+	lbz	r9,0(r3)
+	lbz	r10,0(r4)
+	addi	r3,r3,1
+	addi	r4,r4,1
+	cmplw	cr7,r9,r10
+	cmpdi	cr5,r9,0
+	bne	cr7,L(pagecross_ne)
+	beq	cr5,L(pagecross_nullfound)
+	bdnz	L(pagecross_loop)
+	b	L(align_8b)
+
+	.align	4
+	/* The unaligned read of source2 will cross a 4K page boundary,
+	   and the differing byte or NULL may be in the remaining page
+	   bytes.  Since it cannot use the unaligned load, the algorithm
+	   reads and compares 8 bytes to keep source1 doubleword aligned.  */
+L(check_source2_byte):
+	li	r9,8
+	mtctr	r9
+
+	.align	4
+L(check_source2_byte_loop):
+	lbz	r9,0(r7)
+	lbz	r10,0(r4)
+	addi	r7,r7,1
+	addi	r4,r4,1
+	cmplw	cr7,r9,r10
+	cmpdi	cr5,r9,0
+	bne	cr7,L(pagecross_ne)
+	beq	cr5,L(pagecross_nullfound)
+	bdnz	L(check_source2_byte_loop)
+
+	/* If source2 is unaligned to doubleword, the code needs to check
+	   on each iteration if the unaligned doubleword access will cross
+	   a 4k page boundary.  */
+	.align	5
+L(loop_unaligned):
+	ld	r8,0(r7)
+	ld	r10,0(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+	addi	r7,r7,8
+	addi	r4,r4,8
+
+L(loop_diff_align):
+	/* Check if [src2]+8 cross a 4k page boundary:
+
+	     srcin2 % PAGE_SIZE > (PAGE_SIZE - 8)
+
+	     with PAGE_SIZE being 4096.  */
+	rldicl	r9,r4,0,52
+	cmpldi	cr7,r9,4088
+	ble	cr7,L(loop_unaligned)
+	b	L(check_source2_byte)
+
+	.align	4
+L(pagecross_ne):
+	extsw	r3,r9
+	mr	r9,r10
+L(pagecross_retdiff):
+	subf	r9,r9,r3
+	extsw	r3,r9
+	blr
+
+	.align	4
+L(pagecross_nullfound):
+	li	r3,0
+	b	L(pagecross_retdiff)
+END (strcmp)
+libc_hidden_builtin_def (strcmp)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcpy.S b/sysdeps/powerpc/powerpc64/power8/strcpy.S
new file mode 100644
index 0000000000..d3e9a101c5
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcpy.S
@@ -0,0 +1,262 @@
+/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPCPY
+# define FUNC_NAME __stpcpy
+#else
+# define FUNC_NAME strcpy
+#endif
+
+/* Implements the function
+
+   char * [r3] strcpy (char *dest [r3], const char *src [r4])
+
+   or
+
+   char * [r3] stpcpy (char *dest [r3], const char *src [r4])
+
+   if USE_AS_STPCPY is defined.
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending on data alignment.  Although recent powerpc64 uses
+   64K as default page size, the page cross handling assumes a minimum
+   page size of 4k.  */
+
+	.machine  power7
+EALIGN (FUNC_NAME, 4, 0)
+        li      r0,0          /* Doubleword with null chars to use
+                                 with cmpb.  */
+
+	/* Check if the [src]+15 will cross a 4K page by checking if the bit
+	   indicating the page size changes.  Basically:
+
+	   uint64_t srcin = (uint64_t)src;
+	   uint64_t ob = srcin & 4096UL;
+	   uint64_t nb = (srcin+15UL) & 4096UL;
+	   if (ob ^ nb)
+	     goto pagecross;  */
+
+	addi	r9,r4,15
+	xor	r9,r9,r4
+	rlwinm.	r9,r9,0,19,19
+	bne	L(pagecross)
+
+	/* For short strings (less than 16 bytes), just calculate the size as
+	   strlen does and issue a memcpy if a null is found.  */
+	mr	r7,r4
+        ld      r12,0(r7)     /* Load doubleword from memory.  */
+        cmpb    r10,r12,r0    /* Check for null bytes in DWORD1.  */
+        cmpdi   cr7,r10,0     /* If r10 == 0, no null's have been found.  */
+        bne     cr7,L(done)
+
+        ldu     r8,8(r7)
+        cmpb    r10,r8,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+	b	L(loop_before)
+
+	.align	4
+L(pagecross):
+	clrrdi  r7,r4,3       /* Align the address to doubleword boundary.  */
+	rlwinm  r6,r4,3,26,28 /* Calculate padding.  */
+	li      r5,-1         /* MASK = 0xffffffffffffffff.  */
+        ld      r12,0(r7)     /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+        sld     r5,r5,r6
+#else
+        srd     r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
+        orc     r9,r12,r5     /* Mask bits that are not part of the string.  */
+        cmpb    r10,r9,r0     /* Check for null bytes in DWORD1.  */
+        cmpdi   cr7,r10,0     /* If r10 == 0, no null's have been found.  */
+        bne     cr7,L(done)
+
+        ldu     r6,8(r7)
+        cmpb    r10,r6,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+        ld      r12,0(r7)
+        cmpb    r10,r12,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+        ldu     r6,8(r7)
+        cmpb    r10,r6,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+	/* We checked for 24 - x bytes, with x being the source alignment
+	   (0 <= x <= 16), and no zero has been found.  Start the loop
+	   copy with doubleword aligned address.  */
+	mr	r7,r4
+	ld	r12, 0(r7)
+	ldu	r8, 8(r7)
+
+L(loop_before):
+	/* Save the two doublewords read from the source and align the source
+	   to 16 bytes for the loop.  */
+	mr	r11,r3
+	std	r12,0(r11)
+	std	r8,8(r11)
+	addi	r11,r11,16
+	rldicl	r9,r4,0,60
+	subf	r7,r9,r7
+	subf	r11,r9,r11
+	b	L(loop_start)
+
+        .align  5
+L(loop):
+        std     r12, 0(r11)
+        std     r6, 8(r11)
+	addi	r11,r11,16
+L(loop_start):
+        /* Load two doublewords, compare and merge in a
+           single register for speed.  This is an attempt
+           to speed up the null-checking process for bigger strings.  */
+
+        ld      r12, 8(r7)
+        ldu     r6, 16(r7)
+        cmpb    r10,r12,r0
+        cmpb    r9,r6,r0
+        or      r8,r9,r10     /* Merge everything in one doubleword.  */
+        cmpdi   cr7,r8,0
+        beq     cr7,L(loop)
+
+
+        /* OK, one (or both) of the doublewords contains a null byte.  Check
+           the first doubleword and decrement the address in case the first
+           doubleword really contains a null byte.  */
+
+	addi	r4,r7,-8
+        cmpdi   cr6,r10,0
+        addi    r7,r7,-8
+        bne     cr6,L(done2)
+
+        /* The null byte must be in the second doubleword.  Adjust the address
+           again and move the result of cmpb to r10 so we can calculate the
+           length.  */
+
+        mr      r10,r9
+        addi    r7,r7,8
+	b	L(done2)
+
+        /* r10 has the output of the cmpb instruction, that is, it contains
+           0xff in the same position as the null byte in the original
+           doubleword from the string.  Use that to calculate the length.  */
+L(done):
+	mr	r11,r3
+L(done2):
+#ifdef __LITTLE_ENDIAN__
+        addi    r9, r10, -1   /* Form a mask from trailing zeros.  */
+        andc    r9, r9, r10
+        popcntd r6, r9        /* Count the bits in the mask.  */
+#else
+        cntlzd  r6,r10        /* Count leading zeros before the match.  */
+#endif
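+        /* Illustrative note: on little endian, (r10 - 1) & ~r10 sets a bit
+           for every position below the first 0xff byte of the cmpb result,
+           so popcntd / 8 yields the byte index of the '\0'; on big endian,
+           cntlzd / 8 yields the same index directly.  */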
+        subf    r5,r4,r7
+        srdi    r6,r6,3       /* Convert leading/trailing zeros to bytes.  */
+        add     r8,r5,r6      /* Compute final length.  */
+#ifdef USE_AS_STPCPY
+	/* stpcpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r8
+#endif
+	addi	r8,r8,1       /* Final '\0'.  */
+
+	cmpldi	cr6,r8,8
+	mtocrf	0x01,r8
+	ble	cr6,L(copy_LE_8)
+
+	cmpldi	cr1,r8,16
+	blt	cr1,8f
+
+	/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32):
+	/* At least 6 bytes to go.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	ld	r6,0(r4)
+	ld	r8,8(r4)
+	addi	r4,r4,16
+	std	r6,0(r11)
+	std	r8,8(r11)
+	addi	r11,r11,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4)
+	ld	r6,0(r4)
+	addi	r4,r4,8
+	std	r6,0(r11)
+	addi	r11,r11,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	r6,0(r4)
+	stw	r6,0(r11)
+	bf	30,L(tail5)
+	lhz	r7,4(r4)
+	sth	r7,4(r11)
+	bflr	31
+	lbz	r8,6(r4)
+	stb	r8,6(r11)
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	lhz	r6,0(r4)
+	sth	r6,0(r11)
+	bflr	31
+	lbz	r7,2(r4)
+	stb	r7,2(r11)
+	blr
+
+	.align	4
+L(tail5):
+	bf	31,1f
+	lbz	r6,4(r4)
+	stb	r6,4(r11)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	r6,0(r4)
+	stb	r6,0(r11)
+	blr
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8):
+	bne	cr6,L(tail4)
+	ld	r6,0(r4)
+	std	r6,0(r11)
+	blr
+END (FUNC_NAME)
+
+#ifndef USE_AS_STPCPY
+libc_hidden_builtin_def (strcpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strncmp.S b/sysdeps/powerpc/powerpc64/power8/strncmp.S
new file mode 100644
index 0000000000..56c814b88c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncmp.S
@@ -0,0 +1,323 @@
+/* Optimized strncmp implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Implements the function
+
+   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t n [r5])
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending on data alignment.  Although recent powerpc64 uses
+   64K as default page size, the page cross handling assumes a minimum
+   page size of 4k.  */
+
+	.machine  power7
+EALIGN (strncmp, 4, 0)
+	/* Check if size is 0.  */
+	mr.	r10,r5
+	beq	cr0,L(ret0)
+
+	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
+	   the code:
+
+	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+	   with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
+	rldicl	r8,r3,0,52
+	cmpldi	cr7,r8,4096-16
+	bgt	cr7,L(pagecross)
+	rldicl	r9,r4,0,52
+	cmpldi	cr7,r9,4096-16
+	bgt	cr7,L(pagecross)
+
+	/* For short strings up to 16 bytes, load both s1 and s2 using
+	   unaligned dwords and compare.  */
+	ld	r7,0(r3)
+	ld	r9,0(r4)
+	li	r8,0
+	cmpb	r8,r7,r8
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	bne	cr0,L(different1)
+
+	/* If the strings compared are equal, but the size is less than or
+	   equal to 8, return 0.  */
+	cmpldi	cr7,r10,8
+	li	r9,0
+	ble	cr7,L(ret1)
+	addi	r5,r10,-8
+
+	ld	r7,8(r3)
+	ld	r9,8(r4)
+	cmpb	r8,r7,r8
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	bne	cr0,L(different0)
+
+	cmpldi	cr7,r5,8
+	mr	r9,r8
+	ble	cr7,L(ret1)
+
+	/* Update pointers and size.  */
+	addi	r10,r10,-16
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	/* Now that the first 16 bytes have been checked, align source1 to
+	   doubleword and adjust the source2 address.  */
+L(align_8b):
+	rldicl	r5,r3,0,61
+	rldicr	r3,r3,0,60
+	subf	r4,r5,r4
+	add	r10,r10,r5
+
+	/* At this point, source1 alignment is 0 and source2 alignment is
+	   between 0 and 7.  Check if source2 alignment is 0, meaning both
+	   sources have the same alignment.  */
+	andi.	r8,r4,0x7
+	beq	cr0,L(loop_eq_align_0)
+
+	li	r5,0
+	b	L(loop_ne_align_1)
+
+	/* If source2 is unaligned to doubleword, the code needs to check
+	   on each iteration if the unaligned doubleword access will cross
+	   a 4k page boundary.  */
+	.align 4
+L(loop_ne_align_0):
+	ld	r7,0(r3)
+	ld	r9,0(r4)
+	cmpb	r8,r7,r5
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	bne	cr0,L(different1)
+
+	cmpldi	cr7,r10,8
+	ble	cr7,L(ret0)
+	addi	r10,r10,-8
+	addi	r3,r3,8
+	addi	r4,r4,8
+L(loop_ne_align_1):
+	rldicl	r9,r4,0,52
+	cmpldi	cr7,r9,4088
+	ble	cr7,L(loop_ne_align_0)
+	cmpdi	cr7,r10,0
+	beq	cr7,L(ret0)
+
+	lbz	r9,0(r3)
+	lbz	r8,0(r4)
+	cmplw	cr7,r9,r8
+	bne	cr7,L(byte_ne_4)
+	cmpdi	cr7,r9,0
+	beq	cr7,L(size_reached_0)
+
+	li	r9,7
+	addi	r8,r3,1
+	mtctr	r9
+	addi	r4,r4,1
+	addi	r10,r10,-1
+	addi	r3,r3,8
+
+	/* The unaligned read of source2 will cross a 4K page boundary,
+	   and the differing byte or NULL may be in the remaining page
+	   bytes.  Since it cannot use the unaligned load, the algorithm
+	   reads and compares 8 bytes to keep source1 doubleword aligned.  */
+	.align 4
+L(loop_ne_align_byte):
+	cmpdi	cr7,r10,0
+	addi	r10,r10,-1
+	beq	cr7,L(ret0)
+	lbz	r9,0(r8)
+	lbz	r7,0(r4)
+	addi	r8,r8,1
+	addi	r4,r4,1
+	cmplw	cr7,r9,r7
+	cmpdi	cr5,r9,0
+	bne	cr7,L(size_reached_2)
+	beq	cr5,L(size_reached_0)
+	bdnz	L(loop_ne_align_byte)
+
+	cmpdi	cr7,r10,0
+	bne+	cr7,L(loop_ne_align_0)
+
+	.align 4
+L(ret0):
+	li	r9,0
+L(ret1):
+	mr	r3,r9
+	blr
+
+	/* The code now checks if the loaded doublewords are different by
+	   issuing a cmpb and shifts the result based on its output:
+
+	#ifdef __LITTLE_ENDIAN__
+	  leadzero = (__builtin_ffsl (z1) - 1);
+	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+	  r1 = (r1 >> leadzero) & 0xFFUL;
+	  r2 = (r2 >> leadzero) & 0xFFUL;
+	#else
+	  leadzero = __builtin_clzl (z1);
+	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+	  r1 = (r1 >> (56 - leadzero)) & 0xFFUL;
+	  r2 = (r2 >> (56 - leadzero)) & 0xFFUL;
+	#endif
+	  return r1 - r2;  */
+
+	.align 4
+L(different0):
+	mr	r10,r5
+#ifdef __LITTLE_ENDIAN__
+L(different1):
+        neg	r11,r8
+        sldi	r10,r10,3
+        and	r8,r11,r8
+        addi	r10,r10,-8
+        cntlzd	r8,r8
+        subfic	r8,r8,63
+        extsw 	r8,r8
+        cmpld	cr7,r8,r10
+        ble	cr7,L(different2)
+        mr	r8,r10
+L(different2):
+        extsw	r8,r8
+#else
+L(different1):
+	addi	r10,r10,-1
+	cntlzd	r8,r8
+	sldi	r10,r10,3
+	cmpld	cr7,r8,r10
+	blt	cr7,L(different2)
+	mr	r8,r10
+L(different2):
+	subfic	r8,r8,56
+#endif
+	srd	r7,r7,r8
+	srd	r9,r9,r8
+	rldicl	r3,r7,0,56
+	rldicl	r9,r9,0,56
+	subf	r9,r9,r3
+	extsw	r9,r9
+	mr	r3,r9
+	blr
+
+	/* If an unaligned 16-byte read crosses a 4K page boundary, a
+	   simple byte-by-byte comparison is used until the page alignment
+	   for s1 is reached.  */
+	.align 4
+L(pagecross):
+	lbz	r7,0(r3)
+	lbz	r9,0(r4)
+	subfic	r8,r8,4095
+	cmplw	cr7,r9,r7
+	bne	cr7,L(byte_ne_3)
+	cmpdi	cr7,r9,0
+	beq	cr7,L(byte_ne_0)
+	addi	r10,r10,-1
+	subf	r7,r8,r10
+	subf	r9,r7,r10
+	addi	r9,r9,1
+	mtctr	r9
+	b	L(pagecross_loop1)
+
+	.align 4
+L(pagecross_loop0):
+	beq	cr7,L(ret0)
+	lbz	r9,0(r3)
+	lbz	r8,0(r4)
+	addi	r10,r10,-1
+	cmplw	cr7,r9,r8
+	cmpdi	cr5,r9,0
+	bne	cr7,L(byte_ne_2)
+	beq	cr5,L(byte_ne_0)
+L(pagecross_loop1):
+	cmpdi	cr7,r10,0
+	addi	r3,r3,1
+	addi	r4,r4,1
+	bdnz	L(pagecross_loop0)
+	cmpdi	cr7,r7,0
+	li	r9,0
+	bne+	cr7,L(align_8b)
+	b	L(ret1)
+
+	/* If both source1 and source2 are doubleword aligned, there is no
+	   need for page boundary cross checks.  */
+	.align 4
+L(loop_eq_align_0):
+	ld	r7,0(r3)
+	ld	r9,0(r4)
+	cmpb	r8,r7,r8
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	bne	cr0,L(different1)
+
+	cmpldi	cr7,r10,8
+	ble	cr7,L(ret0)
+	addi	r9,r10,-9
+
+	li	r5,0
+	srdi	r9,r9,3
+	addi	r9,r9,1
+	mtctr	r9
+	b	L(loop_eq_align_2)
+
+	.align 4
+L(loop_eq_align_1):
+	bdz	L(ret0)
+L(loop_eq_align_2):
+	ldu	r7,8(r3)
+	addi	r10,r10,-8
+	ldu	r9,8(r4)
+	cmpb	r8,r7,r5
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	beq	cr0,L(loop_eq_align_1)
+	b	L(different1)
+
+	.align 4
+L(byte_ne_0):
+	li	r7,0
+L(byte_ne_1):
+	subf	r9,r9,r7
+	extsw	r9,r9
+	b	L(ret1)
+
+	.align 4
+L(byte_ne_2):
+	extsw	r7,r9
+	mr	r9,r8
+	b	L(byte_ne_1)
+L(size_reached_0):
+	li	r10,0
+L(size_reached_1):
+	subf	r9,r9,r10
+	extsw	r9,r9
+	b	L(ret1)
+L(size_reached_2):
+	extsw	r10,r9
+	mr	r9,r7
+	b	L(size_reached_1)
+L(byte_ne_3):
+	extsw	r7,r7
+	b	L(byte_ne_1)
+L(byte_ne_4):
+	extsw	r10,r9
+	mr	r9,r8
+	b	L(size_reached_1)
+END(strncmp)
+libc_hidden_builtin_def(strncmp)
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
new file mode 100644
index 0000000000..5fda953526
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -0,0 +1,424 @@
+/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPNCPY
+# define FUNC_NAME __stpncpy
+#else
+# define FUNC_NAME strncpy
+#endif
+
+/* Implements the function
+
+   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   or
+
+   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPNCPY is defined.
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending on data alignment.  Although recent powerpc64 uses
+   64K as default page size, the page cross handling assumes a minimum
+   page size of 4k.  */
+
+	.machine  power7
+EALIGN (FUNC_NAME, 4, 0)
+
+        /* Check if the [src]+15 will cross a 4K page by checking if the bit
+           indicating the page size changes.  Basically:
+
+           uint64_t srcin = (uint64_t)src;
+           uint64_t ob = srcin & 4096UL;
+           uint64_t nb = (srcin+15UL) & 4096UL;
+           if (ob ^ nb)
+             goto pagecross;  */
+
+	addi	r10,r4,16
+	rlwinm	r9,r4,0,19,19
+
+	/* Since it is a leaf function, save some non-volatile registers in the
+	   protected/red zone.  */
+	std	r26,-48(r1)
+	std	r27,-40(r1)
+
+	rlwinm	r8,r10,0,19,19
+
+	std	r28,-32(r1)
+	std	r29,-24(r1)
+
+	cmpld	cr7,r9,r8
+
+	std	r30,-16(r1)
+	std	r31,-8(r1)
+
+	beq	cr7,L(unaligned_lt_16)
+	rldicl	r9,r4,0,61
+	subfic	r8,r9,8
+	cmpld	cr7,r5,r8
+	bgt 	cr7,L(pagecross)
+
+	/* At this point there are 1 to 15 bytes to check and write.  Since they
+	   could come either from the first unaligned 16-byte access or from the
+	   bulk copy, the code uses an unrolled byte read/write instead of trying
+	   to analyze the cmpb results.  */
+L(short_path):
+	mr	r9,r3
+L(short_path_1):
+	cmpdi	cr7,r5,0
+	beq	cr7,L(short_path_loop_end_1)
+L(short_path_2):
+	lbz	r10,0(r4)
+	cmpdi	cr7,r10,0
+	stb	r10,0(r9)
+	beq	cr7,L(zero_pad_start_1)
+	cmpdi	cr0,r5,1
+	addi	r8,r9,1
+	addi	r6,r5,-1
+	beq	cr0,L(short_path_loop_end_0)
+	lbz	r10,1(r4)
+	cmpdi	cr7,r10,0
+	stb	r10,1(r9)
+	beq	cr7,L(zero_pad_start_prepare_1)
+	addi	r10,r5,-3
+	b	L(short_path_loop_1)
+
+	.align	4
+L(short_path_loop):
+	lbz	r8,0(r4)
+	addi	r7,r10,-2
+	cmpdi	cr5,r8,0
+	stb	r8,0(r9)
+	beq	cr5,L(zero_pad_start_1)
+	beq	cr7,L(short_path_loop_end_0)
+	lbz	r8,1(r4)
+	cmpdi	cr7,r8,0
+	stb	r8,1(r9)
+	beq	cr7,L(zero_pad_start)
+	mr	r10,r7
+L(short_path_loop_1):
+	addic.	r5,r5,-2
+	addi	r9,r9,2
+	cmpdi	cr7,r10,0
+	addi	r4,r4,2
+	addi	r6,r9,1
+	bne	cr0,L(short_path_loop)
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+	b	L(short_path_loop_end)
+#endif
+
+L(short_path_loop_end_0):
+#ifdef USE_AS_STPNCPY
+	addi	r3,r9,1
+	b	L(short_path_loop_end)
+#endif
+L(short_path_loop_end_1):
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+#endif
+L(short_path_loop_end):
+	/* Restore non-volatile registers.  */
+	ld	r26,-48(r1)
+	ld	r27,-40(r1)
+	ld	r28,-32(r1)
+	ld	r29,-24(r1)
+	ld	r30,-16(r1)
+	ld	r31,-8(r1)
+	blr
+
+	/* This code pads the remainder of dest with null bytes.  The
+	   algorithm calculates the remaining size and issues an unrolled
+	   doubleword loop followed by a byte-by-byte set.  */
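+	/* Roughly, in C (an illustration only):
+	     while (rem >= 8) { *(uint64_t *) p = 0; p += 8; rem -= 8; }
+	     while (rem-- > 0) *p++ = '\0';  */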
+	.align	4
+L(zero_pad_start):
+	mr	r5,r10
+	mr	r9,r6
+L(zero_pad_start_1):
+	srdi.	r8,r5,3
+	mr	r10,r9
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+#endif
+	beq-	cr0,L(zero_pad_loop_b_start)
+	cmpldi	cr7,r8,1
+	li	cr7,0
+	std	r7,0(r9)
+	beq	cr7,L(zero_pad_loop_b_prepare)
+	addic.	r8,r8,-2
+	addi	r10,r9,16
+	std	r7,8(r9)
+	beq	cr0,L(zero_pad_loop_dw_2)
+	std	r7,16(r9)
+	li	r9,0
+	b	L(zero_pad_loop_dw_1)
+
+	.align	4
+L(zero_pad_loop_dw):
+	addi	r10,r10,16
+	std	r9,-8(r10)
+	beq	cr0,L(zero_pad_loop_dw_2)
+	std	r9,0(r10)
+L(zero_pad_loop_dw_1):
+	cmpldi	cr7,r8,1
+	std	r9,0(r10)
+	addic.	r8,r8,-2
+	bne	cr7,L(zero_pad_loop_dw)
+	addi	r10,r10,8
+L(zero_pad_loop_dw_2):
+	rldicl	r5,r5,0,61
+L(zero_pad_loop_b_start):
+	cmpdi	cr7,r5,0
+	addi	r5,r5,-1
+	addi	r9,r10,-1
+	add	r10,r10,r5
+	subf	r10,r9,r10
+	li	r8,0
+	beq-	cr7,L(short_path_loop_end)
+
+	/* Write remaining 1-8 bytes.  */
+        .align  4
+	addi	r9,r9,1
+	mtocrf	0x1,r10
+	bf	29,4f
+        stw     r8,0(r9)
+        addi	r9,r9,4
+
+        .align  4
+4:      bf      30,2f
+        sth     r8,0(r9)
+        addi	r9,r9,2
+
+        .align  4
+2:      bf	31,1f
+        stb	r8,0(r9)
+
+	/* Restore non-volatile registers.  */
+1:	ld	r26,-48(r1)
+	ld	r27,-40(r1)
+	ld	r28,-32(r1)
+	ld	r29,-24(r1)
+	ld	r30,-16(r1)
+	ld	r31,-8(r1)
+	blr
+
+	/* The common case where [src]+16 will not cross a 4K page boundary.
+	   In this case the code fast-checks the first 16 bytes using doubleword
+	   reads/compares and updates the destination unless the total size is
+	   reached or a null byte is found.  */
+	.align	4
+L(unaligned_lt_16):
+	cmpldi	cr7,r5,7
+	ble	cr7,L(short_path)
+	ld	r7,0(r4)
+	li	r8,0
+	cmpb	r8,r7,r8
+	cmpdi	cr7,r8,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	r6,r5,-8
+	std	r7,0(r3)
+	addi	r9,r3,8
+	cmpldi	cr7,r6,7
+	addi	r7,r4,8
+	ble	cr7,L(short_path_prepare_1_1)
+	ld	r4,8(r4)
+	cmpb	r8,r4,r8
+	cmpdi	cr7,r8,0
+	bne	cr7,L(short_path_prepare_2_1)
+	std	r4,8(r3)
+	addi	r29,r3,16
+	addi	r5,r5,-16
+	/* Neither the null byte was found or total length was reached,
+	   align to 16 bytes and issue a bulk copy/compare.  */
+	b	L(align_to_16b)
+
+	/* In the case of a 4k page boundary cross, the algorithm first aligns
+	   the address to a doubleword, calculates a mask based on the alignment
+	   to ignore the extra bytes, and continues using doublewords.  */
+	.align	4
+L(pagecross):
+	rldicr	r11,r4,0,59	/* Align the address to 8 bytes boundary.  */
+	li	r6,-1		/* MASK = 0xffffffffffffffffUL.  */
+	sldi	r9,r9,3		/* Calculate padding.  */
+	ld	r7,0(r11)	/* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	r9,r6,r9	/* MASK = MASK << padding.  */
+#else
+	srd	r9,r6,r9	/* MASK = MASK >> padding.  */
+#endif
+	orc	r9,r7,r9	/* Mask bits that are not part of the
+				   string.  */
+	li	r7,0
+	cmpb	r9,r9,r7	/* Check for null bytes in DWORD1.  */
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	subf	r8,r8,r5	/* Adjust total length.  */
+	cmpldi	cr7,r8,8	/* Check if length was reached.  */
+	ble	cr7,L(short_path_prepare_2)
+
+	/* For the next checks we have an aligned address, so we check three
+	   more doublewords to make sure we can read 16 unaligned bytes to
+	   start the bulk copy with 16-byte aligned addresses.  */
+	ld	r7,8(r11)
+	cmpb	r9,r7,r9
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	r8,r8,-8
+	cmpldi	cr7,r8,8
+	ble	cr7,L(short_path_prepare_2)
+	ld	r7,16(r11)
+	cmpb	r9,r7,r9
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	r8,r8,-16
+	cmpldi	cr7,r8,8
+	ble	cr7,L(short_path_prepare_2)
+	ld	r8,24(r11)
+	cmpb	r9,r8,r9
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+
+	/* No null byte was found in the 32 bytes read and the length was not
+	   reached; read the source again using unaligned loads and store the
+	   bytes.  */
+	ld	r9,0(r4)
+	addi	r29,r3,16
+	addi	r5,r5,-16
+	std	r9,0(r3)
+	ld	r9,8(r4)
+	std	r9,8(r3)
+
+	/* Align source to 16 bytes and adjust destination and size.  */
+L(align_to_16b):
+	rldicl	r9,r10,0,60
+	rldicr	r28,r10,0,59
+	add	r12,r5,r9
+	subf	r29,r9,r29
+
+	/* The bulk read/compare/copy loads two doublewords, then compares and
+	   merges them in a single register for speed.  This is an attempt to
+	   speed up the null-checking process for bigger strings.  */
+
+	cmpldi	cr7,r12,15
+	ble	cr7,L(short_path_prepare_1_2)
+
+	/* Main loop for large sizes, unrolled 2 times to get better use of
+	   pipeline.  */
+	ld	r8,0(r28)
+	ld	r10,8(r28)
+	li	r9,0
+	cmpb	r7,r8,r9
+	cmpb	r9,r10,r9
+	or.	r6,r9,r7
+	bne	cr0,L(short_path_prepare_2_3)
+	addi	r5,r12,-16
+	addi	r4,r28,16
+	std	r8,0(r29)
+	std	r10,8(r29)
+	cmpldi	cr7,r5,15
+	addi	r9,r29,16
+	ble	cr7,L(short_path_1)
+	mr	r11,r28
+	mr	r6,r29
+	li	r30,0
+	subfic	r26,r4,48
+	subfic	r27,r9,48
+
+	b	L(loop_16b)
+
+	.align	4
+L(loop_start):
+	ld	r31,0(r11)
+	ld	r10,8(r11)
+	cmpb	r0,r31,r7
+	cmpb	r8,r10,r7
+	or.	r7,r0,r8
+	addi	r5,r5,-32
+	cmpldi	cr7,r5,15
+	add	r4,r4,r26
+	add	r9,r9,r27
+	bne	cr0,L(short_path_prepare_2_2)
+	add	r4,r28,r4
+	std	r31,0(r6)
+	add	r9,r29,r9
+	std	r10,8(r6)
+	ble	cr7,L(short_path_1)
+
+L(loop_16b):
+	ld	r10,16(r11)
+	ld	r0,24(r11)
+	cmpb	r8,r10,r30
+	cmpb	r7,r0,r30
+	or.	r7,r8,r7
+	addi	r12,r12,-32
+	cmpldi	r7,r12,15
+	addi	r11,r11,32
+	bne	cr0,L(short_path_2)
+	std	r10,16(r6)
+	addi	r6,r6,32
+	std	r0,-8(r6)
+	bgt	cr7,L(loop_start)
+
+	mr	r5,r12
+	mr	r4,r11
+	mr	r9,r6
+	b	L(short_path_1)
+
+	.align	4
+L(short_path_prepare_1_1):
+	mr	r5,r6
+	mr	r4,r7
+	b	L(short_path_1)
+L(short_path_prepare_1_2):
+	mr	r5,r12
+	mr	r4,r28
+	mr	r9,r29
+	b	L(short_path_1)
+L(short_path_prepare_2):
+	mr	r9,r3
+	b	L(short_path_2)
+L(short_path_prepare_2_1):
+	mr	r5,r6
+	mr	r4,r7
+	b	L(short_path_2)
+L(short_path_prepare_2_2):
+	mr	r5,r12
+	mr	r4,r11
+	mr	r9,r6
+	b	L(short_path_2)
+L(short_path_prepare_2_3):
+	mr	r5,r12
+	mr	r4,r28
+	mr	r9,r29
+	b	L(short_path_2)
+L(zero_pad_loop_b_prepare):
+	addi	r10,r9,8
+	rldicl	r5,r5,0,61
+	b	L(zero_pad_loop_b_start)
+L(zero_pad_start_prepare_1):
+	mr	r5,r6
+	mr	r9,r8
+	b	L(zero_pad_start_1)
+END (FUNC_NAME)
+
+#ifdef USE_AS_STPNCPY
+libc_hidden_def (__stpncpy)
+#else
+libc_hidden_builtin_def (strncpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strcspn.S b/sysdeps/powerpc/powerpc64/strcspn.S
index 3f6aa0a506..1121930b15 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcspn.S
+++ b/sysdeps/powerpc/powerpc64/strcspn.S
@@ -20,54 +20,42 @@
 
 /* size_t [r3] strcspn (const char [r4] *s, const char [r5] *reject)  */
 
-	.machine power7
 EALIGN (strcspn, 4, 0)
 	CALL_MCOUNT 3
 
 	/* The idea to speed up the algorithm is to create a lookup table
 	   for fast check if input character should be considered.  For ASCII
 	   or ISO-8859-X character sets it has 256 positions.  */
-	lbz	r10,0(r4)
-
-	/* First the table should be cleared and to avoid unaligned accesses
-	   when using the VSX stores the table address is aligned to 16
-	   bytes.  */
-	xxlxor	v0,v0,v0
 
 	/* PPC64 ELF ABI stack is aligned to 16 bytes.  */
 	addi 	r9,r1,-256
+	/* Clear the table with 0 values.  */
+	li	r6, 0
+	li	r8, 4
+	mtctr	r8
+	mr	r10, r9
+	.align 	4
+L(zerohash):
+	std	r6, 0(r10)
+	std	r6, 8(r10)
+	std	r6, 16(r10)
+	std	r6, 24(r10)
+	std	r6, 32(r10)
+	std	r6, 40(r10)
+	std	r6, 48(r10)
+	std	r6, 56(r10)
+	addi	r10, r10, 64
+	bdnz	L(zerohash)
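+	/* Equivalent to memset (table, 0, 256): four iterations of eight
+	   8-byte stores.  */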
 
-	li	r8,48
-	li	r5,16
-	li	r6,32
+	lbz	r10,0(r4)
 	cmpdi	cr7,r10,0	/* reject[0] == '\0' ?  */
-	addi	r12,r9,64
-	/* Clear the table with 0 values  */
-	stxvw4x	v0,r0,r9
-	addi	r11,r9,128
-	addi	r7,r9,192
-	stxvw4x v0,r9,r5
-	stxvw4x v0,r9,r6
-	stxvw4x v0,r9,r8
-	stxvw4x v0,r0,r12
-	stxvw4x v0,r12,r5
-	stxvw4x v0,r12,r6
-	stxvw4x v0,r12,r8
-	stxvw4x v0,r0,r11
-	stxvw4x v0,r11,r5
-	stxvw4x v0,r11,r6
-	stxvw4x v0,r11,r8
-	stxvw4x v0,r0,r7
-	stxvw4x v0,r7,r5
-	stxvw4x v0,r7,r6
-	stxvw4x v0,r7,r8
 	li	r8,1
 	beq     cr7,L(finish_table)  /* If reject[0] == '\0' skip  */
 
 	/* Initialize the table as:
 	   for (i=0; reject[i]; i++
 	     table[reject[i]]] = 1  */
-	.p2align 4,,15
+	.align	4
 L(init_table):
 	stbx	r8,r9,r10
 	lbzu	r10,1(r4)
@@ -93,7 +81,7 @@ L(finish_table):
 	       if (table[input[i++]] == 1)
 	         return i - 1;
 	     }  */
-	.p2align 4,,15
+	.align 4
 L(unroll):
 	lbz	r8,1(r3)
 	addi	r10,r10,4
@@ -121,17 +109,17 @@ L(mainloop):
 	mr	r3,r10
 	blr
 
-	.p2align 4,,15
+	.align 4
 L(end):
 	mr	r3,r6
 	blr
 
-	.p2align 4,,15
+	.align 4
 L(end2):
 	mr	r3,r4
 	blr
 
-	.p2align 4,,15
+	.align 4
 L(end3):
 	mr	r3,r5
 	blr
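
For reference, a C model of the table-based strcspn above (the function name
is illustrative, not part of the patch); marking index 0 lets the NUL
terminator stop the scan without a separate length check:

    #include <stddef.h>

    size_t
    strcspn_model (const char *s, const char *reject)
    {
      unsigned char table[256] = { 0 };
      table[0] = 1;                     /* '\0' always ends the scan.  */
      for (; *reject != '\0'; reject++)
        table[(unsigned char) *reject] = 1;

      size_t i = 0;
      while (table[(unsigned char) s[i]] == 0)
        i++;
      return i;
    }
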
diff --git a/sysdeps/powerpc/powerpc64/power7/strpbrk.S b/sysdeps/powerpc/powerpc64/strpbrk.S
index d6204a7754..6b2ad4d1aa 100644
--- a/sysdeps/powerpc/powerpc64/power7/strpbrk.S
+++ b/sysdeps/powerpc/powerpc64/strpbrk.S
@@ -1,4 +1,4 @@
-/* Optimized strpbrk implementation for PowerPC64/POWER7.
+/* Optimized strpbrk implementation for PowerPC64.
    Copyright (C) 2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -20,7 +20,6 @@
 
 /* char [r3] *strpbrk (const char [r3] *s, const char [r4] *accept)  */
 
-	.machine power7
 EALIGN (strpbrk, 4, 0)
 	CALL_MCOUNT 3
 
@@ -32,43 +31,31 @@ EALIGN (strpbrk, 4, 0)
 	   for fast check if input character should be considered.  For ASCII
 	   or ISO-8859-X character sets it has 256 positions.  */
 
-	/* First the table should be cleared and to avoid unaligned accesses
-	   when using the VSX stores the table address is aligned to 16
-	   bytes.  */
-	xxlxor	v0,v0,v0
-
-	/* PPC64 ELF ABI stack is aligned to 16 bytes  */
+	/* PPC64 ELF ABI stack is aligned to 16 bytes.  */
 	addi 	r9,r1,-256
-
-	li	r5,16
-	li	r6,32
-	li	r8,48
-	addi	r12,r9,64
 	/* Clear the table with 0 values  */
-	stxvw4x v0,r0,r9
-	addi	r11,r9,128
-	addi	r7,r9,192
-	stxvw4x	v0,r9,r5
-	li	r0,1
-	stxvw4x v0,r9,r6
-	stxvw4x v0,r9,r8
-	stxvw4x v0,r0,r12
-	stxvw4x v0,r12,r5
-	stxvw4x v0,r12,r6
-	stxvw4x v0,r12,r8
-	stxvw4x v0,r0,r11
-	stxvw4x v0,r11,r5
-	stxvw4x v0,r11,r6
-	stxvw4x v0,r11,r8
-	stxvw4x v0,r0,r7
-	stxvw4x v0,r7,r5
-	stxvw4x v0,r7,r6
-	stxvw4x v0,r7,r8
+	li	r6, 0
+	li	r7, 4
+	mtctr	r7
+	mr	r8, r9
+	.align 	4
+L(zerohash):
+	std	r6, 0(r8)
+	std	r6, 8(r8)
+	std	r6, 16(r8)
+	std	r6, 24(r8)
+	std	r6, 32(r8)
+	std	r6, 40(r8)
+	std	r6, 48(r8)
+	std	r6, 56(r8)
+	addi	r8, r8, 64
+	bdnz	L(zerohash)
 
 	/* Initialize the table as:
 	   for (i=0; accept[i]; i++)
 	     table[accept[i]] = 1  */
-	.p2align 4,,15
+	li      r0,1
+	.align 4
 L(init_table):
 	stbx	r0,r9,r10
 	lbzu	r10,1(r4)
@@ -93,7 +80,7 @@ L(finish_table):
 	       if (table[input[i++]] == 1)
 	         return (s[i -1] ? s + i - 1: NULL);
 	     }  */
-	.p2align 4
+	.align 4
 L(unroll):
 	lbz	r0,1(r3)
 	lbzx	r8,r9,r0
@@ -121,7 +108,7 @@ L(mainloop):
 L(end):
 	blr
 
-	.p2align 4
+	.align 4
 L(checkend):
 	cmpdi	cr1,r12,0
 	mr	r3,r7
@@ -131,14 +118,14 @@ L(nullfound):
 	li 3,0
 	blr
 
-	.p2align 4
+	.align 4
 L(checkend2):
 	cmpdi	cr7,r0,0
 	mr	r3,r11
 	beq	cr7,L(nullfound)
 	blr
 
-	.p2align 4
+	.align 4
 L(checkend3):
 	cmpdi	cr6,r10,0
 	mr	r3,r5
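
The same table drives strpbrk; only the return value differs, matching the
pseudo-code in the hunk above (NULL when the scan stopped at the terminator).
A hedged C model with an illustrative name:

    #include <stddef.h>

    char *
    strpbrk_model (const char *s, const char *accept)
    {
      unsigned char table[256] = { 0 };
      table[0] = 1;                     /* Stop at the terminator too.  */
      for (; *accept != '\0'; accept++)
        table[(unsigned char) *accept] = 1;

      while (table[(unsigned char) *s] == 0)
        s++;
      return *s != '\0' ? (char *) s : NULL;
    }
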
diff --git a/sysdeps/powerpc/powerpc64/power7/strspn.S b/sysdeps/powerpc/powerpc64/strspn.S
index d587a673f2..daf5d5d747 100644
--- a/sysdeps/powerpc/powerpc64/power7/strspn.S
+++ b/sysdeps/powerpc/powerpc64/strspn.S
@@ -1,4 +1,4 @@
-/* Optimized strspn implementation for PowerPC64/POWER7.
+/* Optimized strspn implementation for PowerPC64.
 
    Copyright (C) 2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
@@ -25,8 +25,6 @@
    > hashing of needle.
    > hashing avoids scanning of duplicate entries in needle
      across the string.
-   > initializing the hash table with Vector instructions
-     by quadword access.
    > unrolling when scanning for character in string
      across hash table.  */
 
@@ -46,55 +44,36 @@
 
 #include <sysdep.h>
 
-#undef strspn
-
-	.machine  power7
 EALIGN(strspn, 4, 0)
-	CALL_MCOUNT 2
-
-	lbz r10, 0(r4)		/* load r10 with needle (r4)  */
-	addi r9, r1, -256	/* r9 is a hash of 256 bytes  */
-
-	li r5, 16		/* set r5 = 16 as offset  */
-	li r6, 32		/* set r6 = 32 as offset  */
-	li r8, 48		/* set r8 = 48 as offset  */
-
-/*Iniatliaze hash table with Zeroes in double indexed quadword accesses  */
-	xxlxor v0, v0, v0	/* prepare for initializing hash  */
-
-	stxvd2x v0, r0, r9	/* initialize 1st quadword  */
-	stxvd2x v0, r9, r5
-	stxvd2x v0, r9, r6
-	stxvd2x v0, r9, r8	/* initialize 4th quadword  */
-
-	addi r11, r9, 64	/* r11 is index to hash  */
-
-	stxvd2x v0, r0, r11	/* initialize 5th quadword  */
-	stxvd2x v0, r11, r5
-	stxvd2x v0, r11, r6
-	stxvd2x v0, r11, r8	/* initialize 8th quadword  */
-
-	addi r11, r9, 128	/* r11 is index to hash  */
-
-	stxvd2x v0, r0, r11	/* initialize 9th quadword  */
-	stxvd2x v0, r11, r5
-	stxvd2x v0, r11, r6
-	stxvd2x v0, r11, r8	/* initialize 12th quadword  */
-
-	addi r11, r9, 192	/* r11 is index to hash  */
-
-	stxvd2x v0, r0, r11	/* initialize 13th quadword  */
-	stxvd2x v0, r11, r5
-	stxvd2x v0, r11, r6
-	stxvd2x v0, r11, r8	/* initialize 16th quadword  */
-
+	CALL_MCOUNT 3
+
+	/* PPC64 ELF ABI stack is aligned to 16 bytes.  */
+	addi 	r9,r1,-256
+	/* Clear the table with 0 values  */
+	li	r6, 0
+	li	r8, 4
+	mtctr	r8
+	mr	r10, r9
+	.align 	4
+L(zerohash):
+	std	r6, 0(r10)
+	std	r6, 8(r10)
+	std	r6, 16(r10)
+	std	r6, 24(r10)
+	std	r6, 32(r10)
+	std	r6, 40(r10)
+	std	r6, 48(r10)
+	std	r6, 56(r10)
+	addi	r10, r10, 64
+	bdnz	L(zerohash)
+
+	lbz	r10,0(r4)
 	li r8, 1		/* r8=1, marker into hash if found in
 				   needle  */
-
 	cmpdi cr7, r10, 0	/* accept needle is NULL  */
 	beq cr7, L(skipHashing)	/* if needle is NULL, skip hashing  */
 
-	.p2align 4		/* align section to 16 byte boundary  */
+	.align 4		/* align section to 16 byte boundary  */
 L(hashing):
 	stbx r8, r9, r10	/* update hash with marker for the pivot of
 				   the needle  */
@@ -106,7 +85,7 @@ L(skipHashing):
 	li r10, 0		/* load counter = 0  */
 	b L(beginScan)
 
-	.p2align 4		/* align section to 16 byte boundary  */
+	.align 4		/* align section to 16 byte boundary  */
 L(scanUnroll):
 	lbzx r8, r9, r8		/* load r8 with hash value at index  */
 	cmpwi cr7, r8, 0	/* if we hit marker in hash, we have found
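
The L(zerohash) loop that replaces the VSX quadword stores in these functions
clears the 256-byte table with base-ISA instructions only: CTR is loaded with
4 and each iteration issues eight 8-byte std stores, so 4 * 8 * 8 = 256 bytes.
A C model of the same pattern:

    #include <stdint.h>

    static void
    clear_table (uint64_t table[32])    /* 32 doublewords = 256 bytes.  */
    {
      for (int iter = 0; iter < 4; iter++)      /* mtctr with count 4.  */
        for (int slot = 0; slot < 8; slot++)    /* Eight std per pass.  */
          table[iter * 8 + slot] = 0;
    }
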
diff --git a/sysdeps/powerpc/powerpc64/strtok.S b/sysdeps/powerpc/powerpc64/strtok.S
new file mode 100644
index 0000000000..fa816f2950
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/strtok.S
@@ -0,0 +1,226 @@
+/* Optimized strtok implementation for PowerPC64.
+
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Performance gains are grabbed through following techniques:
+
+   > hashing of needle.
+   > hashing avoids scanning of duplicate entries in needle
+     across the string.
+   > unrolling when scanning for character in string
+     across hash table.  */
+
+/* Algorithm is as below:
+   1. An empty hash table/dictionary is created, comprising the
+      256-entry ASCII character set.
+   2. When a hash entry is found in the needle, the hash index
+      is initialized to 1.
+   3. The string is scanned until its end and, for every character,
+      its corresponding hash index is compared.
+   4. The initial length of the string (count) until the first hit of
+      the accept needle is calculated and skipped (strspn).
+   5. The string is scanned again until its end and, for every character,
+      its corresponding hash index is compared (strpbrk).
+   6. If the hash index is set to 1 for the index of the string,
+      set it to null and set the saveptr to point to the next char.
+   7. Otherwise count is incremented and scanning continues
+      until the end of the string.  */
+
+#include <sysdep.h>
+#ifdef USE_AS_STRTOK_R
+# define FUNC_NAME __strtok_r
+#else
+# define FUNC_NAME strtok
+#endif
+
+EALIGN(FUNC_NAME, 4, 0)
+#ifdef USE_AS_STRTOK_R
+	CALL_MCOUNT	3
+	cmpdi	cr7, r3, 0		/* Is input null? */
+	bne	cr7, L(inputnotNull)
+	ld	r3, 0(r5)		/* Load from r5 */
+#else
+	CALL_MCOUNT	2
+	addis	r5, r2, .LANCHOR0@toc@ha
+	cmpdi	cr7, r3, 0		/* Is r3 NULL? */
+	bne	cr7, L(inputnotNull)
+	ld	r3, .LANCHOR0@toc@l(r5)	/* Load from saveptr */
+#endif
+L(inputnotNull):
+	mr	r7, r3
+	cmpdi	cr7, r3, 0
+	beq	cr7, L(returnNULL)
+	lbz	r8, 0(r3)
+	cmpdi	cr7, r8, 0
+	beq	cr7, L(returnNULL)
+
+	addi	r9, r1, -256	/* r9 is a hash of 256 bytes  */
+
+	/* Initialize hash table with zeroes.  */
+	li	r6, 0
+	li	r8, 4
+	mtctr	r8
+	mr	r10, r9
+	.align	4
+L(zerohash):
+	std	r6, 0(r10)
+	std	r6, 8(r10)
+	std	r6, 16(r10)
+	std	r6, 24(r10)
+	std	r6, 32(r10)
+	std	r6, 40(r10)
+	std	r6, 48(r10)
+	std	r6, 56(r10)
+	addi	r10, r10, 64
+	bdnz	L(zerohash)
+
+
+	lbz	r10, 0(r4)	/* load r10 with needle (r4)  */
+	li	r8, 1		/* r8=1, marker into hash if found in
+				   needle  */
+
+	cmpdi	cr7, r10, 0	/* accept needle is NULL  */
+	beq	cr7, L(skipHashing)	/* if needle is NULL, skip hashing  */
+
+	.align 4		/* align section to 16 byte boundary  */
+L(hashing):
+	stbx	r8, r9, r10	/* update hash with marker for the pivot of
+				   the needle  */
+	lbzu	r10, 1(r4)	/* load needle into r10 and update to next  */
+	cmpdi	cr7, r10, 0	/* has needle reached NUL?  */
+	bne	cr7, L(hashing)	/* loop to hash the needle  */
+
+L(skipHashing):
+	b	L(beginScan)
+
+	.align 4		/* align section to 16 byte boundary  */
+L(scanUnroll):
+	lbzx	r8, r9, r8	/* load r8 with hash value at index  */
+	cmpwi	cr7, r8, 0	/* check the hash  value */
+	beq	cr7, L(ret1stIndex)	/* we have hit accept needle */
+
+	lbz	r8, 1(r7)	/* load string[1] into r8  */
+	lbzx	r8, r9, r8	/* load r8 with hash value at index  */
+	cmpwi	cr7, r8, 0	/* check the hash  value */
+	beq	cr7, L(ret2ndIndex)	/* we have hit accept needle */
+
+	lbz	r8, 2(r7)	/* load string[2] into r8  */
+	lbzx	r8, r9, r8	/* load r8 with hash value at index  */
+	cmpwi	cr7, r8, 0	/* check the hash  value */
+	beq	cr7, L(ret3rdIndex)	/* we have hit accept needle */
+
+	lbz	r8, 3(r7)	/* load string[3] into r8  */
+	addi	r7, r7, 4
+	lbzx	r8, r9, r8	/* load r8 with hash value at index  */
+	cmpwi	cr7, r8, 0	/* check the hash  value */
+	beq	cr7,L(ret4thIndex)	/* we have hit accept needle */
+
+L(beginScan):
+	lbz	r8, 0(r7)	/* load string[0] into r8  */
+	addi	r6, r7, 1
+	addi	r11, r7, 2
+	addi	r4, r7, 3
+	cmpdi	cr7, r8, 0	/*  check if its null */
+	bne	cr7, L(scanUnroll)	/* continue scanning  */
+
+L(ret1stIndex):
+	mr 	r3, r7
+	b 	L(next)
+L(ret2ndIndex):
+	mr 	r3, r6
+	b 	L(next)
+L(ret3rdIndex):
+	mr 	r3, r11
+	b 	L(next)
+L(ret4thIndex):
+	mr 	r3, r4
+L(next):
+	mr	r7, r3
+	lbz	r8, 0(r7)
+	cmpdi	cr7, r8, 0
+	beq	cr7, L(returnNULL)
+	li	r8, 1
+	li	r10, 0		/* load counter = 0  */
+	stbx	r8, r9, r10	/* update hash for NULL */
+	b	L(mainloop)
+
+L(unroll):
+	lbz	r8, 1(r7)	/* load string[1] into r8  */
+	lbzx	r8, r9, r8	/* load r8 with hash value at index  */
+	cmpwi	cr7, r8, 1	/* check the hash  */
+	beq	cr7, L(foundat1st)	/* we have hit accept needle */
+	lbz	r8, 2(r7)
+	lbzx	r8, r9, r8
+	cmpwi	cr7, r8, 1
+	beq	cr7, L(foundat2nd)
+	lbz	r8, 3(r7)
+	addi	r7, r7, 4
+	lbzx	r8, r9, r8
+	cmpwi	cr7, r8, 1
+	beq	cr7, L(foundat3rd)
+L(mainloop):
+	lbz	r8, 0(r7)
+	addi	r6, r7, 1
+	addi	r11, r7, 2
+	addi	r4, r7, 3
+	lbzx	r8, r9, r8
+	cmpwi	cr7, r8, 1
+	bne	cr7, L(unroll)	/* continue scanning  */
+
+	b	L(found)
+L(foundat1st):
+	mr	r7, r6
+	b	L(found)
+L(foundat2nd):
+	mr	r7, r11
+	b	L(found)
+L(foundat3rd):
+	mr	r7, r4
+L(found):
+	lbz	r8, 0(r7)
+	cmpdi	cr7, r8, 0
+	beq	cr7, L(end)
+	li	r10, 0
+	stb	r10, 0(r7)	/* Terminate string */
+	addi	r7, r7, 1	/* Store the pointer to the next char */
+L(end):
+#ifdef USE_AS_STRTOK_R
+	std	r7, 0(r5)	/* Update saveptr */
+#else
+	std	r7, .LANCHOR0@toc@l(r5)
+#endif
+	blr			/* done  */
+L(returnNULL):
+#ifndef USE_AS_STRTOK_R
+	li	r7, 0
+#endif
+	li	r3, 0		/* return NULL */
+	b	L(end)
+END(FUNC_NAME)
+#ifdef USE_AS_STRTOK_R
+libc_hidden_builtin_def (strtok_r)
+#else
+	.section        ".bss"
+	.align 3
+	.set    .LANCHOR0,. + 0
+	.type   olds, @object
+	.size   olds, 8
+olds:
+	.zero   8
+libc_hidden_builtin_def (strtok)
+#endif
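
A hedged C restatement of the numbered algorithm in strtok.S above, with
strtok_r expressed as a strspn/strcspn composition (strcspn stands in for the
strpbrk scan of step 5; the function name is illustrative):

    #include <string.h>

    char *
    strtok_r_model (char *s, const char *delim, char **saveptr)
    {
      if (s == NULL)
        s = *saveptr;                   /* Resume from the saved pointer.  */
      s += strspn (s, delim);           /* Steps 3-4: skip leading delimiters.  */
      if (*s == '\0')
        {
          *saveptr = s;
          return NULL;
        }
      char *end = s + strcspn (s, delim); /* Step 5: find the next delimiter.  */
      if (*end != '\0')
        *end++ = '\0';                  /* Step 6: terminate the token.  */
      *saveptr = end;                   /* Save the restart position.  */
      return s;
    }
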
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c b/sysdeps/powerpc/powerpc64/strtok_r.S
index 3609d93ad2..6e5d301035 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
+++ b/sysdeps/powerpc/powerpc64/strtok_r.S
@@ -1,4 +1,4 @@
-/* Multiple versions of strcspn. PowerPC64 version.
+/* Optimized strtok_r implementation for PowerPC64.
    Copyright (C) 2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,16 +16,9 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#define USE_AS_STRTOK_R
+#include <sysdeps/powerpc/powerpc64/strtok.S>
 
-extern __typeof (strcspn) __strcspn_ppc attribute_hidden;
-extern __typeof (strcspn) __strcspn_power7 attribute_hidden;
-
-libc_ifunc (strcspn,
-	    (hwcap & PPC_FEATURE_HAS_VSX)
-	    ? __strcspn_power7
-	    : __strcspn_ppc);
-#endif
+weak_alias (__strtok_r, strtok_r)
+libc_hidden_def (__strtok_r)
+libc_hidden_builtin_def (strtok_r)
diff --git a/sysdeps/powerpc/powerpc64/sysdep.h b/sysdeps/powerpc/powerpc64/sysdep.h
index b28fb9d8aa..78722c6873 100644
--- a/sysdeps/powerpc/powerpc64/sysdep.h
+++ b/sysdeps/powerpc/powerpc64/sysdep.h
@@ -283,7 +283,23 @@ LT_LABELSUFFIX(name,_name_end): ; \
   TRACEBACK_MASK(name,mask)	\
   END_2(name)
 
+#if !defined IS_IN_rtld && defined (ENABLE_LOCK_ELISION)
+# define ABORT_TRANSACTION \
+    cmpdi    13,0;		\
+    beq      1f;		\
+    lwz      0,TM_CAPABLE(13);	\
+    cmpwi    0,0;		\
+    beq	     1f;		\
+    li	     0,_ABORT_SYSCALL;	\
+    tabort.  0;			\
+    .align 4;                   \
+1:
+#else
+# define ABORT_TRANSACTION
+#endif
+
 #define DO_CALL(syscall) \
+    ABORT_TRANSACTION \
     li 0,syscall; \
     sc
 
diff --git a/sysdeps/powerpc/sysdep.h b/sysdeps/powerpc/sysdep.h
index e6627c071f..04d109f0e0 100644
--- a/sysdeps/powerpc/sysdep.h
+++ b/sysdeps/powerpc/sysdep.h
@@ -21,6 +21,10 @@
  */
 #define _SYSDEPS_SYSDEP_H 1
 #include <bits/hwcap.h>
+#ifdef ENABLE_LOCK_ELISION
+#include <tls.h>
+#include <htm.h>
+#endif
 
 #define PPC_FEATURE_970 (PPC_FEATURE_POWER4 + PPC_FEATURE_HAS_ALTIVEC)
 
@@ -164,4 +168,22 @@
 #define ALIGNARG(log2) log2
 #define ASM_SIZE_DIRECTIVE(name) .size name,.-name
 
+#else
+
+/* Linux kernel powerpc documentation [1] states issuing a syscall inside a
+   transaction is not recommended and may lead to undefined behavior.  It
+   also states syscalls do not abort transactions.  To avoid such traps,
+   we abort the transaction just before syscalls.
+
+   [1] Documentation/powerpc/transactional_memory.txt [Syscalls]  */
+#if !defined IS_IN_rtld && defined (ENABLE_LOCK_ELISION)
+# define ABORT_TRANSACTION \
+  ({ 						\
+    if (THREAD_GET_TM_CAPABLE ())		\
+      __builtin_tabort (_ABORT_SYSCALL);	\
+  })
+#else
+# define ABORT_TRANSACTION
+#endif
+
 #endif	/* __ASSEMBLER__ */
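
With this macro in place, every inline syscall first dooms any active
transaction.  A hedged sketch of how a one-argument wrapper might expand (the
wrapper name and the minimal clobber list are illustrative; real wrappers
declare many more clobbers):

    static inline long
    syscall1_sketch (long nr, long arg1)
    {
      if (THREAD_GET_TM_CAPABLE ())             /* HTM-capable thread?  */
        __builtin_tabort (_ABORT_SYSCALL);      /* Doom the transaction.  */

      register long r0 __asm__ ("r0") = nr;
      register long r3 __asm__ ("r3") = arg1;
      __asm__ __volatile__ ("sc"
                            : "+r" (r3)
                            : "r" (r0)
                            : "memory", "cr0");
      return r3;
    }
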
diff --git a/sysdeps/unix/sysv/linux/powerpc/Makefile b/sysdeps/unix/sysv/linux/powerpc/Makefile
index 28f7165815..974a1bf749 100644
--- a/sysdeps/unix/sysv/linux/powerpc/Makefile
+++ b/sysdeps/unix/sysv/linux/powerpc/Makefile
@@ -34,4 +34,6 @@ endif
 
 ifeq ($(subdir),nptl)
 libpthread-routines += sysdep
+libpthread-sysdep_routines += elision-lock elision-unlock elision-timed \
+			      elision-trylock
 endif
diff --git a/sysdeps/unix/sysv/linux/powerpc/bits/pthreadtypes.h b/sysdeps/unix/sysv/linux/powerpc/bits/pthreadtypes.h
index 4e9c5184aa..998f6d42b8 100644
--- a/sysdeps/unix/sysv/linux/powerpc/bits/pthreadtypes.h
+++ b/sysdeps/unix/sysv/linux/powerpc/bits/pthreadtypes.h
@@ -90,14 +90,23 @@ typedef union
        binary compatibility.  */
     int __kind;
 #if __WORDSIZE == 64
-    int __spins;
+    short __spins;
+    short __elision;
     __pthread_list_t __list;
 # define __PTHREAD_MUTEX_HAVE_PREV	1
+# define __PTHREAD_SPINS             0, 0
 #else
     unsigned int __nusers;
     __extension__ union
     {
-      int __spins;
+      struct
+      {
+	short __espins;
+	short __elision;
+# define __spins __elision_data.__espins
+# define __elision __elision_data.__elision
+# define __PTHREAD_SPINS         { 0, 0 }
+      } __elision_data;
       __pthread_slist_t __list;
     };
 #endif
@@ -106,9 +115,6 @@ typedef union
   long int __align;
 } pthread_mutex_t;
 
-/* Mutex __spins initializer used by PTHREAD_MUTEX_INITIALIZER.  */
-#define __PTHREAD_SPINS 0
-
 typedef union
 {
   char __size[__SIZEOF_PTHREAD_MUTEXATTR_T];
@@ -166,11 +172,13 @@ typedef union
     unsigned int __nr_writers_queued;
     int __writer;
     int __shared;
-    unsigned long int __pad1;
+    unsigned char __rwelision;
+    unsigned char __pad1[7];
     unsigned long int __pad2;
     /* FLAGS must stay at this position in the structure to maintain
        binary compatibility.  */
     unsigned int __flags;
+# define __PTHREAD_RWLOCK_ELISION_EXTRA 0, {0, 0, 0, 0, 0, 0, 0 }
   } __data;
 # else
   struct
@@ -181,20 +189,20 @@ typedef union
     unsigned int __writer_wakeup;
     unsigned int __nr_readers_queued;
     unsigned int __nr_writers_queued;
-    unsigned char __pad1;
+    unsigned char __rwelision;
     unsigned char __pad2;
     unsigned char __shared;
     /* FLAGS must stay at this position in the structure to maintain
        binary compatibility.  */
     unsigned char __flags;
     int __writer;
+#define __PTHREAD_RWLOCK_ELISION_EXTRA 0
   } __data;
 # endif
   char __size[__SIZEOF_PTHREAD_RWLOCK_T];
   long int __align;
 } pthread_rwlock_t;
 
-#define __PTHREAD_RWLOCK_ELISION_EXTRA 0
 
 typedef union
 {
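
The ABI point of these layout changes: the int __spins slot is split into two
shorts, and a rwlock pad byte is reused for __rwelision, so the elision
counters fit without growing the public types.  A minimal check of that
invariant:

    /* Layout invariant behind the __spins split above.  */
    struct old_slot { int __spins; };
    struct new_slot { short __spins; short __elision; };

    _Static_assert (sizeof (struct new_slot) == sizeof (struct old_slot),
                    "elision counters must not change pthread_mutex_t size");
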
diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-conf.c b/sysdeps/unix/sysv/linux/powerpc/elision-conf.c
new file mode 100644
index 0000000000..70fbbb2215
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/elision-conf.c
@@ -0,0 +1,83 @@
+/* elision-conf.c: Lock elision tunable parameters.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include <pthreadP.h>
+#include <elision-conf.h>
+#include <unistd.h>
+#include <dl-procinfo.h>
+
+/* Reasonable initial tuning values, may be revised in the future.
+   These are conservative starting values.  */
+
+struct elision_config __elision_aconf =
+  {
+    /* How many times to use a non-transactional lock after a transactional
+       failure has occurred because the lock is already acquired.  Expressed
+       in number of lock acquisition attempts.  */
+    .skip_lock_busy = 3,
+    /* How often to not attempt to use elision if a transaction aborted due
+       to reasons other than other threads' memory accesses.  Expressed in
+       number of lock acquisition attempts.  */
+    .skip_lock_internal_abort = 3,
+    /* How often to not attempt to use elision if a lock used up all retries
+       without success.  Expressed in number of lock acquisition attempts.  */
+    .skip_lock_out_of_tbegin_retries = 3,
+    /* How often we retry using elision if there is a chance for the
+       transaction to finish execution (e.g., it wasn't aborted due to
+       the lock being already acquired).  */
+    .try_tbegin = 3,
+    /* Same as SKIP_LOCK_INTERNAL_ABORT but for trylock.  */
+    .skip_trylock_internal_abort = 3,
+  };
+
+/* Force elision for all new locks.  This is used to decide whether existing
+   DEFAULT locks should automatically use elision in pthread_mutex_lock().
+   Disabled for suid programs.  Only used when elision is available.  */
+
+int __pthread_force_elision attribute_hidden;
+
+/* Initialize elision.  */
+
+static void
+elision_init (int argc __attribute__ ((unused)),
+	      char **argv  __attribute__ ((unused)),
+	      char **environ)
+{
+#ifdef ENABLE_LOCK_ELISION
+  int elision_available = (GLRO (dl_hwcap2) & PPC_FEATURE2_HAS_HTM) ? 1 : 0;
+  __pthread_force_elision = __libc_enable_secure ? 0 : elision_available;
+#endif
+  if (!__pthread_force_elision)
+    /* Disable elision on rwlocks.  */
+    __elision_aconf.try_tbegin = 0;
+}
+
+#ifdef SHARED
+# define INIT_SECTION ".init_array"
+# define MAYBE_CONST
+#else
+# define INIT_SECTION ".preinit_array"
+# define MAYBE_CONST const
+#endif
+
+void (*MAYBE_CONST __pthread_init_array []) (int, char **, char **)
+  __attribute__ ((section (INIT_SECTION), aligned (sizeof (void *)))) =
+{
+  &elision_init
+};
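
elision_init keys off GLRO (dl_hwcap2).  Outside of libc the same capability
test can be written with getauxval; a hedged sketch, where the
PPC_FEATURE2_HAS_HTM fallback value mirrors the kernel's asm/cputable.h:

    #include <sys/auxv.h>

    #ifndef PPC_FEATURE2_HAS_HTM
    # define PPC_FEATURE2_HAS_HTM 0x40000000
    #endif

    static int
    htm_available (void)
    {
      return (getauxval (AT_HWCAP2) & PPC_FEATURE2_HAS_HTM) != 0;
    }
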
diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-conf.h b/sysdeps/unix/sysv/linux/powerpc/elision-conf.h
new file mode 100644
index 0000000000..fc9994de97
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/elision-conf.h
@@ -0,0 +1,42 @@
+/* elision-conf.h: Lock elision tunable parameters.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _ELISION_CONF_H
+#define _ELISION_CONF_H 1
+
+#include <pthread.h>
+#include <time.h>
+
+/* Should make sure there is no false sharing on this.  */
+struct elision_config
+{
+  int skip_lock_busy;
+  int skip_lock_internal_abort;
+  int skip_lock_out_of_tbegin_retries;
+  int try_tbegin;
+  int skip_trylock_internal_abort;
+} __attribute__ ((__aligned__ (128)));
+
+extern struct elision_config __elision_aconf attribute_hidden;
+
+extern int __pthread_force_elision attribute_hidden;
+
+/* Tell the test suite to test elision for this architecture.  */
+#define HAVE_ELISION 1
+
+#endif
diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-lock.c b/sysdeps/unix/sysv/linux/powerpc/elision-lock.c
new file mode 100644
index 0000000000..2ce75b20b8
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/elision-lock.c
@@ -0,0 +1,107 @@
+/* elision-lock.c: Elided pthread mutex lock.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+#include <pthread.h>
+#include <pthreadP.h>
+#include <lowlevellock.h>
+#include <elision-conf.h>
+#include "htm.h"
+
+/* PowerISA 2.07 Section B.5.5 states that isync is insufficient as a
+   barrier in the acquire mechanism for HTM operations; a full 'sync'
+   is required.  */
+#undef __arch_compare_and_exchange_val_32_acq
+#define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval)           \
+  ({                                                                          \
+      __typeof (*(mem)) __tmp;                                                \
+      __typeof (mem)  __memp = (mem);                                         \
+      __asm __volatile (                                                      \
+                        "1:     lwarx   %0,0,%1" MUTEX_HINT_ACQ "\n"          \
+                        "       cmpw    %0,%2\n"                              \
+                        "       bne     2f\n"                                 \
+                        "       stwcx.  %3,0,%1\n"                            \
+                        "       bne-    1b\n"                                 \
+                        "2:     sync"                                         \
+                        : "=&r" (__tmp)                                       \
+                        : "b" (__memp), "r" (oldval), "r" (newval)            \
+                        : "cr0", "memory");                                   \
+      __tmp;                                                                  \
+  })
+
+#if !defined(LLL_LOCK) && !defined(EXTRAARG)
+/* Make sure the configuration code is always linked in for static
+   libraries.  */
+#include "elision-conf.c"
+#endif
+
+#ifndef EXTRAARG
+# define EXTRAARG
+#endif
+#ifndef LLL_LOCK
+# define LLL_LOCK(a,b) lll_lock(a,b), 0
+#endif
+
+#define aconf __elision_aconf
+
+/* Adaptive lock using transactions.
+   By default the lock region is run as a transaction, and when it
+   aborts or the lock is busy the lock adapts itself.  */
+
+int
+__lll_lock_elision (int *lock, short *adapt_count, EXTRAARG int pshared)
+{
+  if (*adapt_count > 0)
+    {
+      (*adapt_count)--;
+      goto use_lock;
+    }
+
+  int try_begin = aconf.try_tbegin;
+  while (1)
+    {
+      if (__builtin_tbegin (0))
+	{
+	  if (*lock == 0)
+	    return 0;
+	  /* Lock was busy.  Fall back to normal locking.  */
+	  __builtin_tabort (_ABORT_LOCK_BUSY);
+	}
+      else
+	{
+	  /* A persistent failure indicates that a retry will probably
+	     result in another failure.  Use normal locking now and
+	     for the next couple of calls.  */
+	  if (try_begin-- <= 0
+	      || _TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ()))
+	    {
+	      if (aconf.skip_lock_internal_abort > 0)
+		*adapt_count = aconf.skip_lock_internal_abort;
+	      goto use_lock;
+	    }
+	  /* Same logic as above, but for a number of temporary failures
+	     in a row.  */
+	  else if (aconf.skip_lock_out_of_tbegin_retries > 0
+                   && aconf.try_tbegin > 0)
+	    *adapt_count = aconf.skip_lock_out_of_tbegin_retries;
+	}
+    }
+
+use_lock:
+  return LLL_LOCK ((*lock), pshared);
+}
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk.c b/sysdeps/unix/sysv/linux/powerpc/elision-timed.c
index 8b05536ae1..7d5de9b681 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strpbrk.c
+++ b/sysdeps/unix/sysv/linux/powerpc/elision-timed.c
@@ -1,4 +1,4 @@
-/* Multiple versions of strpbrk. PowerPC64 version.
+/* elision-timed.c: Lock elision timed lock.
    Copyright (C) 2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,16 +16,13 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <time.h>
+#include <elision-conf.h>
+#include "lowlevellock.h"
 
-extern __typeof (strpbrk) __strpbrk_ppc attribute_hidden;
-extern __typeof (strpbrk) __strpbrk_power7 attribute_hidden;
+#define __lll_lock_elision __lll_timedlock_elision
+#define EXTRAARG const struct timespec *t,
+#undef LLL_LOCK
+#define LLL_LOCK(a, b) lll_timedlock(a, t, b)
 
-libc_ifunc (strpbrk,
-	    (hwcap & PPC_FEATURE_HAS_VSX)
-	    ? __strpbrk_power7
-	    : __strpbrk_ppc);
-#endif
+#include "elision-lock.c"
diff --git a/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c b/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
new file mode 100644
index 0000000000..4a310d5945
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/elision-trylock.c
@@ -0,0 +1,68 @@
+/* elision-trylock.c: Lock eliding trylock for pthreads.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <pthread.h>
+#include <pthreadP.h>
+#include <lowlevellock.h>
+#include <elision-conf.h>
+#include "htm.h"
+
+#define aconf __elision_aconf
+
+/* Try to elide a futex trylock.  FUTEX is the futex variable.  ADAPT_COUNT is
+   the adaptation counter in the mutex.  */
+
+int
+__lll_trylock_elision (int *futex, short *adapt_count)
+{
+  /* Implement POSIX semantics by forbidding nesting of elided trylocks.  */
+  __builtin_tabort (_ABORT_NESTED_TRYLOCK);
+
+  /* Only try a transaction if it's worth it.  */
+  if (*adapt_count > 0)
+    {
+      (*adapt_count)--;
+      goto use_lock;
+    }
+
+  if (__builtin_tbegin (0))
+    {
+      if (*futex == 0)
+	return 0;
+
+      /* Lock was busy.  Fall back to normal locking.  */
+      __builtin_tabort (_ABORT_LOCK_BUSY);
+    }
+  else
+    {
+      if (_TEXASRU_FAILURE_PERSISTENT (__builtin_get_texasru ()))
+	{
+	  /* A persistent failure indicates that a retry will probably
+	     result in another failure.  Use normal locking now and
+	     for the next couple of calls.  */
+	  if (aconf.skip_trylock_internal_abort > 0)
+	    *adapt_count = aconf.skip_trylock_internal_abort;
+	}
+
+      if (aconf.skip_lock_busy > 0)
+	*adapt_count = aconf.skip_lock_busy;
+    }
+
+use_lock:
+  return lll_trylock (*futex);
+}
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
index bf8c877ec2..59d46bb43a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strspn.c
+++ b/sysdeps/unix/sysv/linux/powerpc/elision-unlock.c
@@ -1,4 +1,4 @@
-/* Multiple versions of strspn. PowerPC64 version.
+/* elision-unlock.c: Commit an elided pthread lock.
    Copyright (C) 2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,16 +16,17 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include "pthreadP.h"
+#include "lowlevellock.h"
+#include "htm.h"
 
-extern __typeof (strspn) __strspn_ppc attribute_hidden;
-extern __typeof (strspn) __strspn_power7 attribute_hidden;
-
-libc_ifunc (strspn,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strspn_power7
-            : __strspn_ppc);
-#endif
+int
+__lll_unlock_elision (int *lock, int pshared)
+{
+  /* When the lock was free, we are in a transaction and can commit it.  */
+  if (*lock == 0)
+    __builtin_tend (0);
+  else
+    lll_unlock ((*lock), pshared);
+  return 0;
+}
diff --git a/sysdeps/unix/sysv/linux/powerpc/force-elision.h b/sysdeps/unix/sysv/linux/powerpc/force-elision.h
new file mode 100644
index 0000000000..3da576b944
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/force-elision.h
@@ -0,0 +1,28 @@
+/* force-elision.h: Automatic enabling of elision for mutexes
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef ENABLE_LOCK_ELISION
+/* Automatically enable elision for existing user lock kinds.  */
+#define FORCE_ELISION(m, s)						\
+  if (__pthread_force_elision						\
+      && (m->__data.__kind & PTHREAD_MUTEX_ELISION_FLAGS_NP) == 0)	\
+    {									\
+      m->__data.__kind |= PTHREAD_MUTEX_ELISION_NP;			\
+      s;								\
+    }
+#endif
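
In nptl/pthread_mutex_lock.c the macro is invoked on the locking path roughly
as below, so a DEFAULT mutex is upgraded to the elision kind once and takes
the elided path on subsequent calls (simplified; see the nptl sources for the
exact context):

    /* Simplified call site, as in nptl/pthread_mutex_lock.c.  */
    FORCE_ELISION (mutex, goto elision);
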
diff --git a/sysdeps/unix/sysv/linux/powerpc/htm.h b/sysdeps/unix/sysv/linux/powerpc/htm.h
new file mode 100644
index 0000000000..4a570bea6e
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/htm.h
@@ -0,0 +1,138 @@
+/* Shared HTM header.  Emulate transactional execution facility intrinsics for
+   compilers and assemblers that do not support the intrinsics and instructions
+   yet.
+
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _HTM_H
+#define _HTM_H 1
+
+#ifdef __ASSEMBLER__
+
+/* tbegin.  */
+.macro TBEGIN
+	.long 0x7c00051d
+.endm
+
+/* tend. 0  */
+.macro TEND
+	.long 0x7c00055d
+.endm
+
+/* tabort. code  */
+.macro TABORT code
+	.byte 0x7c
+	.byte \code
+	.byte 0x07
+	.byte 0x1d
+.endm
+
+/*"TEXASR - Transaction EXception And Summary Register"
+   mfspr %dst,130  */
+.macro TEXASR dst
+	mfspr \dst,130
+.endm
+
+#else
+
+#include <endian.h>
+
+/* Official HTM intrinsics interface matching GCC, but works
+   on older GCC compatible compilers and binutils.
+   We should somehow detect if the compiler supports it, because
+   it may be able to generate slightly better code.  */
+
+#define TBEGIN ".long 0x7c00051d"
+#define TEND   ".long 0x7c00055d"
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+# define TABORT ".byte 0x1d,0x07,%1,0x7c"
+#else
+# define TABORT ".byte 0x7c,%1,0x07,0x1d"
+#endif
+
+#define __force_inline        inline __attribute__((__always_inline__))
+
+#ifndef __HTM__
+
+#define _TEXASRU_EXTRACT_BITS(TEXASR,BITNUM,SIZE) \
+  (((TEXASR) >> (31-(BITNUM))) & ((1<<(SIZE))-1))
+#define _TEXASRU_FAILURE_PERSISTENT(TEXASRU) \
+  _TEXASRU_EXTRACT_BITS(TEXASRU, 7, 1)
+
+#define _tbegin()			\
+  ({ unsigned int __ret;		\
+     asm volatile (			\
+       TBEGIN "\t\n"			\
+       "mfcr   %0\t\n"			\
+       "rlwinm %0,%0,3,1\t\n"		\
+       "xori %0,%0,1\t\n"		\
+       : "=r" (__ret) :			\
+       : "cr0", "memory");		\
+     __ret;				\
+  })
+
+#define _tend()				\
+  ({ unsigned int __ret;		\
+     asm volatile (			\
+       TEND "\t\n"			\
+       "mfcr   %0\t\n"			\
+       "rlwinm %0,%0,3,1\t\n"		\
+       "xori %0,%0,1\t\n"		\
+       : "=r" (__ret) :			\
+       : "cr0", "memory");		\
+     __ret;				\
+  })
+
+#define _tabort(__code)			\
+  ({ unsigned int __ret;		\
+     asm volatile (			\
+       TABORT "\t\n"			\
+       "mfcr   %0\t\n"			\
+       "rlwinm %0,%0,3,1\t\n"		\
+       "xori %0,%0,1\t\n"		\
+       : "=r" (__ret) : "r" (__code)	\
+       : "cr0", "memory");		\
+     __ret;				\
+  })
+
+#define _texasru()			\
+  ({ unsigned long __ret;		\
+     asm volatile (			\
+       "mfspr %0,131\t\n"		\
+       : "=r" (__ret));			\
+     __ret;				\
+  })
+
+#define __builtin_tbegin(tdb)       _tbegin ()
+#define __builtin_tend(nested)      _tend ()
+#define __builtin_tabort(abortcode) _tabort (abortcode)
+#define __builtin_get_texasru()     _texasru ()
+
+#else
+# include <htmintrin.h>
+#endif /* __HTM__  */
+
+#endif /* __ASSEMBLER__ */
+
+/* Definitions used for TEXASR Failure code (bits 0:6), they need to be even
+   because tabort. always sets the first bit.  */
+#define _ABORT_LOCK_BUSY       0x3f   /* Lock already used.  */
+#define _ABORT_NESTED_TRYLOCK  0x3e   /* Write operation in trylock.  */
+#define _ABORT_SYSCALL         0x3d   /* Syscall issued.  */
+
+#endif
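
A hedged sketch of the intrinsics in use: begin a transaction, do the
speculative work, commit, and on failure take a non-transactional fallback.
The elision code in this patch follows the same shape with a real lock as the
fallback:

    static int counter;

    static void
    increment (void)
    {
      if (__builtin_tbegin (0))         /* Nonzero: transaction started.  */
        {
          counter++;                    /* Speculative update.  */
          __builtin_tend (0);           /* Commit.  */
        }
      else                              /* Aborted: take the fallback.  */
        __atomic_fetch_add (&counter, 1, __ATOMIC_SEQ_CST);
    }
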
diff --git a/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h b/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
index a651d23c50..0e930d00bc 100644
--- a/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
+++ b/sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2014 Free Software Foundation, Inc.
+/* Copyright (C) 2003-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Paul Mackerras <paulus@au.ibm.com>, 2003.
 
@@ -339,4 +339,28 @@ extern int __lll_timedwait_tid (int *, const struct timespec *)
     __res;								      \
   })
 
+/* Transactional lock elision definitions.  */
+extern int __lll_timedlock_elision
+  (int *futex, short *adapt_count, const struct timespec *timeout, int private)
+  attribute_hidden;
+
+#define lll_timedlock_elision(futex, adapt_count, timeout, private)	      \
+  __lll_timedlock_elision (&(futex), &(adapt_count), timeout, private)
+
+extern int __lll_lock_elision (int *futex, short *adapt_count, int private)
+  attribute_hidden;
+
+extern int __lll_unlock_elision (int *lock, int private)
+  attribute_hidden;
+
+extern int __lll_trylock_elision (int *lock, short *adapt_count)
+  attribute_hidden;
+
+#define lll_lock_elision(futex, adapt_count, private) \
+  __lll_lock_elision (&(futex), &(adapt_count), private)
+#define lll_unlock_elision(futex, private) \
+  __lll_unlock_elision (&(futex), private)
+#define lll_trylock_elision(futex, adapt_count) \
+  __lll_trylock_elision (&(futex), &(adapt_count))
+
 #endif	/* lowlevellock.h */
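
These macros take the futex word and the adapt counter by name and pass their
addresses through.  A hedged sketch of how an elidable mutex lock would
dispatch (field names from the pthreadtypes.h hunk above; the real nptl wiring
goes through the LLL_MUTEX_LOCK hooks):

    /* Sketch of the dispatch, not the literal nptl code.  */
    if (mutex->__data.__kind & PTHREAD_MUTEX_ELISION_NP)
      lll_lock_elision (mutex->__data.__lock, mutex->__data.__elision,
                        PTHREAD_MUTEX_PSHARED (mutex));
    else
      lll_lock (mutex->__data.__lock, PTHREAD_MUTEX_PSHARED (mutex));
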
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h b/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h
index 1a5e37a1d9..0947ca34a6 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep.h
@@ -194,6 +194,7 @@
     register long int r11 __asm__ ("r11");				\
     register long int r12 __asm__ ("r12");				\
     LOADARGS_##nr(name, args);						\
+    ABORT_TRANSACTION;							\
     __asm__ __volatile__						\
       ("sc   \n\t"							\
        "mfcr %0"							\
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h b/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h
index 93e454e902..a3cc3025e0 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep.h
@@ -201,6 +201,7 @@
     register long int r7  __asm__ ("r7");				\
     register long int r8  __asm__ ("r8");				\
     LOADARGS_##nr (name, ##args);					\
+    ABORT_TRANSACTION;							\
     __asm__ __volatile__						\
       ("sc\n\t"								\
        "mfcr  %0\n\t"							\
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_cond_lock.c
index 72b75acff7..aa6cf9a79e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S
+++ b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_cond_lock.c
@@ -1,5 +1,4 @@
-/* Optimized bzero implementation for PowerPC64/POWER4.
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+/* Copyright (C) 2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,11 +15,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
+/* The cond lock is not actually elided yet, but we still need to handle
+   already elided locks.  */
+#include <elision-conf.h>
 
-ENTRY (__bzero_power4)
-	CALL_MCOUNT 3
-	mr	r5,r4
-	li	r4,0
-	b	__memset_power4
-END_GEN_TB (__bzero_power4,TB_TOCLESS)
+#include <nptl/pthread_mutex_cond_lock.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_lock.c
index d0917c5e66..6fd6a9866f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S
+++ b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_lock.c
@@ -1,5 +1,5 @@
-/* Optimized bzero implementation for PowerPC64/POWER6.
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+/* Elided version of pthread_mutex_lock.
+   Copyright (C) 2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,11 +16,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
+#include <elision-conf.h>
+#include <force-elision.h>
 
-ENTRY (__bzero_power6)
-	CALL_MCOUNT 3
-	mr	r5,r4
-	li	r4,0
-	b	__memset_power6
-END_GEN_TB (__bzero_power6,TB_TOCLESS)
+#include <nptl/pthread_mutex_lock.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_timedlock.c
index 0ec285a9bd..d0e6537ecc 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S
+++ b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_timedlock.c
@@ -1,5 +1,5 @@
-/* Optimized bzero implementation for PowerPC64/POWER7.
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+/* Elided version of pthread_mutex_timedlock.
+   Copyright (C) 2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,11 +16,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
+#include <elision-conf.h>
+#include <force-elision.h>
 
-ENTRY (__bzero_power7)
-	CALL_MCOUNT 3
-	mr	r5,r4
-	li	r4,0
-	b	__memset_power7
-END_GEN_TB (__bzero_power7,TB_TOCLESS)
+#include <nptl/pthread_mutex_timedlock.c>
diff --git a/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_trylock.c b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_trylock.c
new file mode 100644
index 0000000000..ea8a8fff93
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/pthread_mutex_trylock.c
@@ -0,0 +1,22 @@
+/* Elided version of pthread_mutex_trylock.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <elision-conf.h>
+#include <force-elision.h>
+
+#include <nptl/pthread_mutex_trylock.c>