author    Zack Weinberg <zackw@panix.com>  2017-06-08 15:39:03 -0400
committer Zack Weinberg <zackw@panix.com>  2017-06-08 15:39:03 -0400
commit    5046dbb4a7eba5eccfd258f92f4735c9ffc8d069 (patch)
tree      4470480d904b65cf14ca524f96f79eca818c3eaf /REORG.TODO/sysdeps/powerpc/powerpc64/power7
parent    199fc19d3aaaf57944ef036e15904febe877fc93 (diff)
Prepare for radical source tree reorganization.  (branch: zack/build-layout-experiment)
All top-level files and directories are moved into a temporary storage
directory, REORG.TODO, except for files that will certainly still
exist in their current form at top level when we're done (COPYING,
COPYING.LIB, LICENSES, NEWS, README), all old ChangeLog files (which
are moved to the new directory OldChangeLogs, instead), and the
generated file INSTALL (which is just deleted; in the new order, there
will be no generated files checked into version control).
Diffstat (limited to 'REORG.TODO/sysdeps/powerpc/powerpc64/power7')
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/Implies                  2
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/Makefile                11
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/add_n.S                 98
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/bcopy.c                  1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/Implies              1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies    1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S          70
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S          1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S           69
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S           1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S           68
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S           1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c             1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c            1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c            1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/memchr.S               199
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcmp.S              1061
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcpy.S               430
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/memmove.S              835
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/mempcpy.S              472
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/memrchr.S              201
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/memset.S               399
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/multiarch/Implies        1
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/rawmemchr.S            115
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/stpncpy.S               24
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp.S           126
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S           5
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchr.S               230
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchrnul.S            131
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcmp.S               168
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strlen.S               107
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncmp.S              227
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S              722
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strnlen.S              182
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strrchr.S              260
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c          27
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr.S               521
-rw-r--r--  REORG.TODO/sysdeps/powerpc/powerpc64/power7/sub_n.S                 23
38 files changed, 6793 insertions, 0 deletions
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Implies
new file mode 100644
index 0000000000..9d68f39d22
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc64/power6/fpu
+powerpc/powerpc64/power6
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Makefile b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Makefile
new file mode 100644
index 0000000000..89a2296085
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/Makefile
@@ -0,0 +1,11 @@
+ifeq ($(subdir),elf)
+# Prevent the use of VSX registers and insns in _dl_start, which under -O3
+# optimization may require a TOC reference before relocations are resolved.
+CFLAGS-rtld.c += -mno-vsx
+endif
+
+ifeq ($(subdir),string)
+sysdep_routines += strstr-ppc64
+CFLAGS-strncase.c += -funroll-loops
+CFLAGS-strncase_l.c += -funroll-loops
+endif
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/add_n.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/add_n.S
new file mode 100644
index 0000000000..6425afbc9f
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/add_n.S
@@ -0,0 +1,98 @@
+/* PowerPC64 mpn_add_n/mpn_sub_n -- mpn addition and
+   subtraction.
+   Copyright (C) 2003-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/*             cycles/limb
+ * POWER7             2.18
+ */
+
+#ifdef USE_AS_SUB
+# define FUNC      __mpn_sub_n
+# define ADDSUBC   subfe
+#else
+# define FUNC      __mpn_add_n
+# define ADDSUBC   adde
+#endif
+
+#define RP  r3
+#define UP  r4
+#define VP  r5
+#define N   r6
+
+EALIGN(FUNC, 5, 0)
+#ifdef USE_AS_SUB
+	addic	r0, r1, -1	/* Set CA (no initial borrow).  */
+#else
+	addic	r0, r0, 0	/* Clear CA (no initial carry).  */
+#endif
+	andi.	r7, N, 1
+	beq	L(bx0)
+
+	ld	r7, 0(UP)
+	ld	r9, 0(VP)
+	ADDSUBC	r11, r9, r7
+	std	r11, 0(RP)
+	cmpldi	N, 1
+	beq	L(end)
+	addi	UP, UP, 8
+	addi	VP, VP, 8
+	addi	RP, RP, 8
+
+L(bx0):	addi	r0, N, 2
+	srdi	r0, r0, 2
+	mtctr	r0
+
+	andi.	r7, N, 2
+	bne	L(mid)
+
+	addi	UP, UP, 16
+	addi	VP, VP, 16
+	addi	RP, RP, 16
+
+	.align	5
+L(top):	ld	r6, -16(UP)
+	ld	r7, -8(UP)
+	ld	r8, -16(VP)
+	ld	r9, -8(VP)
+	ADDSUBC	r10, r8, r6
+	ADDSUBC	r11, r9, r7
+	std	r10, -16(RP)
+	std	r11, -8(RP)
+L(mid):	ld	r6, 0(UP)
+	ld	r7, 8(UP)
+	ld	r8, 0(VP)
+	ld	r9, 8(VP)
+	ADDSUBC	r10, r8, r6
+	ADDSUBC	r11, r9, r7
+	std	r10, 0(RP)
+	std	r11, 8(RP)
+	addi	UP, UP, 32
+	addi	VP, VP, 32
+	addi	RP, RP, 32
+	bdnz	L(top)
+
+L(end):	subfe	r3, r0, r0
+#ifdef USE_AS_SUB
+	neg	r3, r3
+#else
+	addi	r3, r3, 1
+#endif
+	blr
+END(FUNC)
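
For readers not fluent in POWER carry-chain idioms: with USE_AS_SUB undefined this shared source builds __mpn_add_n, and with it defined __mpn_sub_n; the CA bit carries between limbs, and the final subfe/addi (or neg) materializes the carry or borrow out of the top limb as the return value. A minimal C model of the add case, assuming GMP-style 64-bit limbs (model_mpn_add_n is an illustrative name, not a glibc symbol):

#include <stddef.h>
#include <stdint.h>

/* Illustrative model of __mpn_add_n: rp[i] = up[i] + vp[i] with carry
   propagation across n limbs; returns the carry out of the top limb.
   This mirrors what the adde loop above computes; it is not the
   actual glibc/GMP implementation.  */
static uint64_t
model_mpn_add_n (uint64_t *rp, const uint64_t *up, const uint64_t *vp,
                 size_t n)
{
  uint64_t carry = 0;
  for (size_t i = 0; i < n; i++)
    {
      uint64_t u = up[i];
      uint64_t sum = u + vp[i];
      uint64_t c1 = sum < u;           /* carry out of the raw add */
      rp[i] = sum + carry;
      carry = c1 | (rp[i] < sum);      /* plus carry from the low limbs */
    }
  return carry;
}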
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/bcopy.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/bcopy.c
new file mode 100644
index 0000000000..4a6a400e7a
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/bcopy.c
@@ -0,0 +1 @@
+/* Implemented in memmove.S.  */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/Implies
new file mode 100644
index 0000000000..30fa17646e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power6/fpu
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies
new file mode 100644
index 0000000000..410d289a6d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/multiarch/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power6/fpu/multiarch
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
new file mode 100644
index 0000000000..9ccc758c9e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finite.S
@@ -0,0 +1,70 @@
+/* finite().  PowerPC64/POWER7 version.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __finite(x)  */
+	.section    ".toc","aw"
+.LC0:   /* 1.0 */
+	.tc	    FD_ONE[TC],0x3ff0000000000000
+	.section    ".text"
+	.type	    __finite, @function
+	.machine    power7
+EALIGN (__finite, 4, 0)
+	CALL_MCOUNT 0
+	lfd     fp0,.LC0@toc(r2)
+	ftdiv   cr7,fp1,fp0
+	li	r3,1
+	bflr    30
+
+	/* If we are here, we either have +/-INF,
+	NaN or denormal.  */
+
+	stfd    fp1,-16(r1)   /* Transfer FP to GPR's.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	lhz     r4,-16+HISHORT(r1)  /* Fetch the upper 16 bits of the FP value
+				    (biased exponent and sign bit).  */
+	clrlwi  r4,r4,17      /* r4 = abs(r4).  */
+	cmpwi   cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
+	bltlr   cr7	      /* LT means finite, otherwise non-finite.  */
+	li      r3,0
+	blr
+	END (__finite)
+
+hidden_def (__finite)
+weak_alias (__finite, finite)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__finite, __finitef)
+hidden_def (__finitef)
+weak_alias (__finitef, finitef)
+
+#if IS_IN (libm)
+# if LONG_DOUBLE_COMPAT (libm, GLIBC_2_0)
+compat_symbol (libm, __finite, __finitel, GLIBC_2_0)
+compat_symbol (libm, finite, finitel, GLIBC_2_0)
+# endif
+#else
+# if LONG_DOUBLE_COMPAT (libc, GLIBC_2_0)
+compat_symbol (libc, __finite, __finitel, GLIBC_2_0);
+compat_symbol (libc, finite, finitel, GLIBC_2_0);
+# endif
+#endif
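
The ftdiv against the TOC constant 1.0 flags the awkward cases (per the comment above: ±Inf, NaN, or denormal) in the CR7 bit tested by bflr 30, so ordinary values return 1 without touching memory; the fallback then classifies by the top 16 bits of the raw encoding. A C sketch of that exponent test, assuming IEEE 754 binary64 layout (an illustration, not the glibc implementation):

#include <stdint.h>
#include <string.h>

/* A double is finite iff its biased exponent field is not all ones.
   After dropping the sign bit, the top 16 bits of a finite value are
   strictly below 0x7ff0 -- the same compare the assembly performs.  */
static int
model_finite (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);     /* "transfer FP to GPRs" */
  unsigned hi = (unsigned) (bits >> 48) & 0x7fff;
  return hi < 0x7ff0;
}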
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S
new file mode 100644
index 0000000000..54bd94176d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_finitef.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_finite.S.  */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
new file mode 100644
index 0000000000..4482cddcfa
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinf.S
@@ -0,0 +1,69 @@
+/* isinf().  PowerPC64/POWER7 version.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isinf(x)  */
+	.section    ".toc","aw"
+.LC0:   /* 1.0 */
+	.tc	    FD_ONE[TC],0x3ff0000000000000
+	.section    ".text"
+	.type	    __isinf, @function
+	.machine    power7
+EALIGN (__isinf, 4, 0)
+	CALL_MCOUNT 0
+	lfd	fp0,.LC0@toc(r2)
+	ftdiv	cr7,fp1,fp0
+	li	r3,0
+	bflr    29	      /* If not INF, return.  */
+
+	/* If we are here, we have +/-INF.  */
+
+	stfd    fp1,-16(r1)   /* Transfer FP to GPR's.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	lhz	r4,-16+HISHORT(r1)  /* Fetch the upper 16 bits of the FP value
+				    (biased exponent and sign bit).  */
+	cmpwi	cr7,r4,0x7ff0 /* r4 == 0x7ff0?  */
+	li	r3,1
+	beqlr   cr7	      /* EQ means INF, otherwise -INF.  */
+	li      r3,-1
+	blr
+	END (__isinf)
+
+hidden_def (__isinf)
+weak_alias (__isinf, isinf)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isinf, __isinff)
+hidden_def (__isinff)
+weak_alias (__isinff, isinff)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isinf, __isinfl)
+weak_alias (__isinf, isinfl)
+#endif
+
+#if !IS_IN (libm)
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isinf, __isinfl, GLIBC_2_0);
+compat_symbol (libc, isinf, isinfl, GLIBC_2_0);
+# endif
+#endif
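
Note the return convention here: 1 for +Inf, -1 for -Inf, 0 otherwise. The assembly can compare just the top 16 bits against 0x7ff0 because the earlier ftdiv test (bflr 29) has already filtered out everything except infinities; a standalone C model must test the full encoding. An illustrative sketch with a hypothetical name, not the glibc code:

#include <stdint.h>
#include <string.h>

/* +Inf is 0x7ff0000000000000 and -Inf is 0xfff0000000000000; any
   other encoding (including NaNs, which have nonzero mantissa bits)
   is not an infinity.  */
static int
model_isinf (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  if ((bits & 0x7fffffffffffffffULL) != 0x7ff0000000000000ULL)
    return 0;                          /* finite, denormal, or NaN */
  return (bits >> 63) ? -1 : 1;        /* sign bit selects -1 or +1 */
}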
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S
new file mode 100644
index 0000000000..be759e091e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isinff.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isinf.S.  */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
new file mode 100644
index 0000000000..46b08a0d37
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnan.S
@@ -0,0 +1,68 @@
+/* isnan().  PowerPC64/POWER7 version.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <math_ldbl_opt.h>
+
+/* int __isnan(x)  */
+	.section    ".toc","aw"
+.LC0:   /* 1.0 */
+	.tc	    FD_ONE[TC],0x3ff0000000000000
+	.section    ".text"
+	.type	    __isnan, @function
+	.machine    power7
+EALIGN (__isnan, 4, 0)
+	CALL_MCOUNT 0
+	lfd	fp0,.LC0@toc(r2)
+	ftdiv	cr7,fp1,fp0
+	li	r3,0
+	bflr	30	      /* If not NaN, finish.  */
+
+	stfd    fp1,-16(r1)   /* Transfer FP to GPR's.  */
+	ori	2,2,0	      /* Force a new dispatch group.  */
+	ld	r4,-16(r1)    /* Load FP into GPR.  */
+	lis     r0,0x7ff0
+	sldi	r0,r0,32      /* const long r0 0x7ff00000 00000000.  */
+	clrldi	r4,r4,1	      /* x = fabs(x)  */
+	cmpd	cr7,r4,r0     /* if (fabs(x) <= inf)  */
+	blelr	cr7	      /* LE means not NaN.  */
+	li	r3,1	      /* else return 1  */
+	blr
+	END (__isnan)
+
+hidden_def (__isnan)
+weak_alias (__isnan, isnan)
+
+/* It turns out that the 'double' version will also always work for
+   single-precision.  */
+strong_alias (__isnan, __isnanf)
+hidden_def (__isnanf)
+weak_alias (__isnanf, isnanf)
+
+#ifdef NO_LONG_DOUBLE
+strong_alias (__isnan, __isnanl)
+weak_alias (__isnan, isnanl)
+#endif
+
+#if !IS_IN (libm)
+# if LONG_DOUBLE_COMPAT(libc, GLIBC_2_0)
+compat_symbol (libc, __isnan, __isnanl, GLIBC_2_0);
+compat_symbol (libc, isnan, isnanl, GLIBC_2_0);
+# endif
+#endif
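
The comparison above exploits the encoding order: once the sign bit is cleared, every NaN pattern is strictly greater than the +Inf pattern when the 64 bits are read as an unsigned integer. The same test in C (illustrative, not the glibc code):

#include <stdint.h>
#include <string.h>

/* NaN <=> all-ones exponent and nonzero mantissa, i.e. fabs bits
   strictly above 0x7ff0000000000000 -- the cmpd/blelr pair above.  */
static int
model_isnan (double x)
{
  uint64_t bits;
  memcpy (&bits, &x, sizeof bits);
  bits &= 0x7fffffffffffffffULL;       /* x = fabs(x) */
  return bits > 0x7ff0000000000000ULL;
}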
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S
new file mode 100644
index 0000000000..b48c85e0d3
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_isnanf.S
@@ -0,0 +1 @@
+/* This function uses the same code as s_isnan.S.  */
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c
new file mode 100644
index 0000000000..2599c771d9
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logb.c
@@ -0,0 +1 @@
+#include <sysdeps/powerpc/power7/fpu/s_logb.c>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c
new file mode 100644
index 0000000000..7a5a8032e0
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbf.c
@@ -0,0 +1 @@
+#include <sysdeps/powerpc/power7/fpu/s_logbf.c>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c
new file mode 100644
index 0000000000..524ae2c78d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/fpu/s_logbl.c
@@ -0,0 +1 @@
+#include <sysdeps/powerpc/power7/fpu/s_logbl.c>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memchr.S
new file mode 100644
index 0000000000..5e9707aa02
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memchr.S
@@ -0,0 +1,199 @@
+/* Optimized memchr implementation for PowerPC64/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* void * [r3] memchr (const void *s [r3], int byte [r4], size_t size [r5])  */
+
+#ifndef MEMCHR
+# define MEMCHR __memchr
+#endif
+	.machine  power7
+ENTRY (MEMCHR)
+	CALL_MCOUNT 3
+	dcbt	0,r3
+	clrrdi  r8,r3,3
+	insrdi	r4,r4,8,48
+
+	/* Calculate the last acceptable address and check for possible
+	   addition overflow by using saturated math:
+	   r7 = r3 + r5
+	   r7 |= -(r7 < r3)  */
+	add     r7,r3,r5
+	subfc   r6,r3,r7
+	subfe   r9,r9,r9
+	extsw   r6,r9
+	or      r7,r7,r6
+
+	insrdi	r4,r4,16,32
+	cmpldi	r5,32
+	li	r9, -1
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+	insrdi  r4,r4,32,0
+	addi	r7,r7,-1
+#ifdef __LITTLE_ENDIAN__
+	sld	r9,r9,r6
+#else
+	srd	r9,r9,r6
+#endif
+	ble	L(small_range)
+
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
+	cmpb	r3,r12,r4     /* Check for BYTEs in DWORD1.  */
+	and	r3,r3,r9
+	clrldi	r5,r7,61      /* Byte count - 1 in last dword.  */
+	clrrdi	r7,r7,3       /* Address of last doubleword.  */
+	cmpldi	cr7,r3,0      /* Does r3 indicate we got a hit?  */
+	bne	cr7,L(done)
+
+	mtcrf   0x01,r8
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+	bt	28,L(loop_setup)
+
+	/* Handle DWORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb	r3,r12,r4
+	cmpldi	cr7,r3,0
+	bne	cr7,L(done)
+
+L(loop_setup):
+	/* The last dword we want to read in the loop below is the one
+	   containing the last byte of the string, ie. the dword at
+	   (s + size - 1) & ~7, or r7.  The first dword read is at
+	   r8 + 8, we read 2 * cnt dwords, so the last dword read will
+	   be at r8 + 8 + 16 * cnt - 8.  Solving for cnt gives
+	   cnt = (r7 - r8) / 16  */
+	sub	r6,r7,r8
+	srdi	r6,r6,4	      /* Number of loop iterations.  */
+	mtctr	r6            /* Setup the counter.  */
+
+	/* Main loop to look for BYTE in the string.  Since
+	   it's a small loop (8 instructions), align it to 32-bytes.  */
+	.align	5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the byte-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu	r11,16(r8)
+	cmpb	r3,r12,r4
+	cmpb	r9,r11,r4
+	or	r6,r9,r3      /* Merge everything in one doubleword.  */
+	cmpldi	cr7,r6,0
+	bne	cr7,L(found)
+	bdnz	L(loop)
+
+	/* We may have one more dword to read.  */
+	cmpld	r8,r7
+	beqlr
+
+	ldu	r12,8(r8)
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	bne	cr6,L(done)
+	blr
+
+	.align	4
+L(found):
+	/* OK, one (or both) of the doublewords contains BYTE.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains BYTE.  */
+	cmpldi	cr6,r3,0
+	addi	r8,r8,-8
+	bne	cr6,L(done)
+
+	/* BYTE must be in the second doubleword.  Adjust the address
+	   again and move the result of cmpb to r3 so we can calculate the
+	   pointer.  */
+
+	mr	r3,r9
+	addi	r8,r8,8
+
+	/* r3 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as BYTE in the original
+	   doubleword from the string.  Use that to calculate the pointer.
+	   We need to make sure BYTE is *before* the end of the range.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r3,-1
+	andc    r0,r0,r3
+	popcntd	r0,r0	      /* Count trailing zeros.  */
+#else
+	cntlzd	r0,r3	      /* Count leading zeros before the match.  */
+#endif
+	cmpld	r8,r7         /* Are we on the last dword?  */
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r8,r0
+	cmpld	cr7,r0,r5     /* If on the last dword, check byte offset.  */
+	bnelr
+	blelr	cr7
+	li	r3,0
+	blr
+
+	.align	4
+L(null):
+	li	r3,0
+	blr
+
+/* Deals with size <= 32.  */
+	.align	4
+L(small_range):
+	cmpldi	r5,0
+	beq	L(null)
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
+	cmpb	r3,r12,r4     /* Check for BYTE in DWORD1.  */
+	and	r3,r3,r9
+	cmpldi	cr7,r3,0
+	clrldi	r5,r7,61      /* Byte count - 1 in last dword.  */
+	clrrdi	r7,r7,3       /* Address of last doubleword.  */
+	cmpld	r8,r7         /* Are we done already?  */
+	bne	cr7,L(done)
+	beqlr
+
+	ldu	r12,8(r8)
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	cmpld	r8,r7
+	bne	cr6,L(done)   /* Found something.  */
+	beqlr		      /* Hit end of string (length).  */
+
+	ldu	r12,8(r8)
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	cmpld	r8,r7
+	bne	cr6,L(done)
+	beqlr
+
+	ldu	r12,8(r8)
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	cmpld	r8,r7
+	bne	cr6,L(done)
+	beqlr
+
+	ldu	r12,8(r8)
+	cmpb	r3,r12,r4
+	cmpldi	cr6,r3,0
+	bne	cr6,L(done)
+	blr
+
+END (MEMCHR)
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
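
Two idioms above deserve a plain-language gloss: the end address is computed with saturating arithmetic so that s + size cannot wrap around the address space, and cmpb writes 0xff into every byte lane where the two doublewords agree, so counting leading (big-endian) or trailing (little-endian) zero bits locates the first matching byte. A C model of both, with hypothetical names (illustrative only):

#include <stdint.h>

/* Saturating end pointer: on overflow, clamp to the maximum address.
   This is the branch-free "r7 |= -(r7 < r3)" trick done with an if.  */
static uint64_t
model_saturating_end (uint64_t s, uint64_t size)
{
  uint64_t end = s + size;
  if (end < s)
    end = UINT64_MAX;
  return end;
}

/* Byte-wise compare: 0xff in each byte lane where the bytes agree,
   0x00 elsewhere -- what the cmpb instruction computes in one go.  */
static uint64_t
model_cmpb (uint64_t a, uint64_t b)
{
  uint64_t r = 0;
  for (int i = 0; i < 8; i++)
    {
      uint64_t mask = 0xffULL << (8 * i);
      if ((a & mask) == (b & mask))
        r |= mask;
    }
  return r;
}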
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcmp.S
new file mode 100644
index 0000000000..96ce8cee25
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcmp.S
@@ -0,0 +1,1061 @@
+/* Optimized memcmp implementation for POWER7/PowerPC64.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* int [r3] memcmp (const char *s1 [r3],
+		    const char *s2 [r4],
+		    size_t size [r5])  */
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+	.machine power7
+EALIGN (MEMCMP, 4, 0)
+	CALL_MCOUNT 3
+
+#define rRTN		r3
+#define rSTR1		r3	/* first string arg */
+#define rSTR2		r4	/* second string arg */
+#define rN		r5	/* max string length */
+#define rWORD1		r6	/* current word in s1 */
+#define rWORD2		r7	/* current word in s2 */
+#define rWORD3		r8	/* next word in s1 */
+#define rWORD4		r9	/* next word in s2 */
+#define rWORD5		r10	/* next word in s1 */
+#define rWORD6		r11	/* next word in s2 */
+
+#define rOFF8		r20	/* 8 bytes offset.  */
+#define rOFF16  	r21	/* 16 bytes offset.  */
+#define rOFF24		r22	/* 24 bytes offset.  */
+#define rOFF32		r23	/* 32 bytes offset.  */
+#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
+#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
+#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
+#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
+#define rSHR		r28	/* Unaligned shift right count.  */
+#define rSHL		r29	/* Unaligned shift left count.  */
+#define rWORD7		r30	/* next word in s1 */
+#define rWORD8		r31	/* next word in s2 */
+
+#define rWORD8SAVE	(-8)
+#define rWORD7SAVE	(-16)
+#define rOFF8SAVE	(-24)
+#define rOFF16SAVE	(-32)
+#define rOFF24SAVE	(-40)
+#define rOFF32SAVE	(-48)
+#define rSHRSAVE	(-56)
+#define rSHLSAVE	(-64)
+#define rWORD8SHIFTSAVE	(-72)
+#define rWORD2SHIFTSAVE	(-80)
+#define rWORD4SHIFTSAVE	(-88)
+#define rWORD6SHIFTSAVE	(-96)
+
+#ifdef __LITTLE_ENDIAN__
+# define LD	ldbrx
+#else
+# define LD	ldx
+#endif
+
+	xor	r0, rSTR2, rSTR1
+	cmpldi	cr6, rN, 0
+	cmpldi	cr1, rN, 12
+	clrldi.	r0, r0, 61
+	clrldi	r12, rSTR1, 61
+	cmpldi	cr5, r12, 0
+	beq-	cr6, L(zeroLength)
+	dcbt	0, rSTR1
+	dcbt	0, rSTR2
+/* If less than 12 bytes, use the unaligned
+   byte loop.  */
+	blt	cr1, L(bytealigned)
+	std	rWORD8, rWORD8SAVE(r1)
+	std	rWORD7, rWORD7SAVE(r1)
+	std	rOFF8, rOFF8SAVE(r1)
+	std	rOFF16, rOFF16SAVE(r1)
+	std	rOFF24, rOFF24SAVE(r1)
+	std	rOFF32, rOFF32SAVE(r1)
+	cfi_offset(rWORD8, rWORD8SAVE)
+	cfi_offset(rWORD7, rWORD7SAVE)
+	cfi_offset(rOFF8, rOFF8SAVE)
+	cfi_offset(rOFF16, rOFF16SAVE)
+	cfi_offset(rOFF24, rOFF24SAVE)
+	cfi_offset(rOFF32, rOFF32SAVE)
+
+	li	rOFF8,8
+	li	rOFF16,16
+	li	rOFF24,24
+	li	rOFF32,32
+
+	bne	L(unaligned)
+/* At this point we know both strings have the same alignment and the
+   compare length is at least 8 bytes.  r12 contains the low order
+   3 bits of rSTR1 and cr5 contains the result of the logical compare
+   of r12 to 0.  If r12 == 0 then we are already double word
+   aligned and can perform the DW aligned loop.
+
+   Otherwise we know the two strings have the same alignment (but not
+   yet DW).  So we force the string addresses to the next lower DW
+   boundary and special case this first DW using shift left to
+   eliminate bits preceding the first byte.  Since we want to join the
+   normal (DW aligned) compare loop, starting at the second double word,
+   we need to adjust the length (rN) and special case the loop
+   versioning for the first DW. This ensures that the loop count is
+   correct and the first DW (shifted) is in the expected register pair.  */
+	.align	4
+L(samealignment):
+	clrrdi	rSTR1, rSTR1, 3
+	clrrdi	rSTR2, rSTR2, 3
+	beq	cr5, L(DWaligned)
+	add	rN, rN, r12
+	sldi	rWORD6, r12, 3
+	srdi	r0, rN, 5	/* Divide by 32 */
+	andi.	r12, rN, 24	/* Get the DW remainder */
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	clrldi	rN, rN, 61
+	beq	L(dPs4)
+	mtctr	r0
+	bgt	cr1, L(dPs3)
+	beq	cr1, L(dPs2)
+
+/* Remainder is 8 */
+	.align	3
+L(dsP1):
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD2, rWORD6
+	cmpld	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
+/* Do something useful in this cycle since we have to branch anyway.  */
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	b	L(dP1e)
+/* Remainder is 16 */
+	.align	4
+L(dPs2):
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD2, rWORD6
+	cmpld	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
+/* Do something useful in this cycle since we have to branch anyway.  */
+	LD	rWORD7, rOFF8, rSTR1
+	LD	rWORD8, rOFF8, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	b	L(dP2e)
+/* Remainder is 24 */
+	.align	4
+L(dPs3):
+	sld	rWORD3, rWORD1, rWORD6
+	sld	rWORD4, rWORD2, rWORD6
+	cmpld	cr1, rWORD3, rWORD4
+	b	L(dP3e)
+/* Count is a multiple of 32, remainder is 0 */
+	.align	4
+L(dPs4):
+	mtctr	r0
+	sld	rWORD1, rWORD1, rWORD6
+	sld	rWORD2, rWORD2, rWORD6
+	cmpld	cr7, rWORD1, rWORD2
+	b	L(dP4e)
+
+/* At this point we know both strings are double word aligned and the
+   compare length is at least 8 bytes.  */
+	.align	4
+L(DWaligned):
+	andi.	r12, rN, 24	/* Get the DW remainder */
+	srdi	r0, rN, 5	/* Divide by 32 */
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	clrldi	rN, rN, 61
+	beq	L(dP4)
+	bgt	cr1, L(dP3)
+	beq	cr1, L(dP2)
+
+/* Remainder is 8 */
+	.align	4
+L(dP1):
+	mtctr	r0
+/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
+   (8-15 byte compare), we want to use only volatile registers.  This
+   means we can avoid restoring non-volatile registers since we did not
+   change any on the early exit path.  The key here is the non-early
+   exit path only cares about the condition code (cr5), not about which
+   register pair was used.  */
+	LD	rWORD5, 0, rSTR1
+	LD	rWORD6, 0, rSTR2
+	cmpld	cr5, rWORD5, rWORD6
+	blt	cr7, L(dP1x)
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+L(dP1e):
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5x)
+	bne	cr7, L(dLcr7x)
+
+	LD	rWORD7, rOFF32, rSTR1
+	LD	rWORD8, rOFF32, rSTR2
+	addi	rSTR1, rSTR1, 32
+	addi	rSTR2, rSTR2, 32
+	bne	cr1, L(dLcr1)
+	cmpld	cr5, rWORD7, rWORD8
+	bdnz	L(dLoop)
+	bne	cr6, L(dLcr6)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	.align	3
+L(dP1x):
+	sldi.	r12, rN, 3
+	bne	cr5, L(dLcr5x)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+	bne	L(d00)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 0
+	blr
+
+/* Remainder is 16 */
+	.align	4
+L(dP2):
+	mtctr	r0
+	LD	rWORD5, 0, rSTR1
+	LD	rWORD6, 0, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP2x)
+	LD	rWORD7, rOFF8, rSTR1
+	LD	rWORD8, rOFF8, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+L(dP2e):
+	LD	rWORD1, rOFF16, rSTR1
+	LD	rWORD2, rOFF16, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	LD	rWORD3, rOFF24, rSTR1
+	LD	rWORD4, rOFF24, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+	bne	cr6, L(dLcr6)
+	bne	cr5, L(dLcr5)
+	b	L(dLoop2)
+	.align	4
+L(dP2x):
+	LD	rWORD3, rOFF8, rSTR1
+	LD	rWORD4, rOFF8, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	sldi.	r12, rN, 3
+	bne	cr6, L(dLcr6x)
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+	bne	cr1, L(dLcr1x)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+	bne	L(d00)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 0
+	blr
+
+/* Remainder is 24 */
+	.align	4
+L(dP3):
+	mtctr	r0
+	LD	rWORD3, 0, rSTR1
+	LD	rWORD4, 0, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+L(dP3e):
+	LD	rWORD5, rOFF8, rSTR1
+	LD	rWORD6, rOFF8, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	blt	cr7, L(dP3x)
+	LD	rWORD7, rOFF16, rSTR1
+	LD	rWORD8, rOFF16, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	LD	rWORD1, rOFF24, rSTR1
+	LD	rWORD2, rOFF24, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+	bne	cr1, L(dLcr1)
+	bne	cr6, L(dLcr6)
+	b	L(dLoop1)
+/* Again we are on an early exit path (24-31 byte compare); we want to
+   use only volatile registers and avoid restoring non-volatile
+   registers.  */
+	.align	4
+L(dP3x):
+	LD	rWORD1, rOFF16, rSTR1
+	LD	rWORD2, rOFF16, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	sldi.	r12, rN, 3
+	bne	cr1, L(dLcr1x)
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+	bne	cr6, L(dLcr6x)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+	bne	cr7, L(dLcr7x)
+	bne	L(d00)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 0
+	blr
+
+/* Count is a multiple of 32, remainder is 0 */
+	.align	4
+L(dP4):
+	mtctr	r0
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+L(dP4e):
+	LD	rWORD3, rOFF8, rSTR1
+	LD	rWORD4, rOFF8, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	LD	rWORD5, rOFF16, rSTR1
+	LD	rWORD6, rOFF16, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	LD	rWORD7, rOFF24, rSTR1
+	LD	rWORD8, rOFF24, rSTR2
+	addi	rSTR1, rSTR1, 24
+	addi	rSTR2, rSTR2, 24
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
+	bne	cr1, L(dLcr1)
+	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
+/* This is the primary loop */
+	.align	4
+L(dLoop):
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
+L(dLoop1):
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
+L(dLoop2):
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr7, L(dLcr7)
+L(dLoop3):
+	LD	rWORD7, rOFF32, rSTR1
+	LD	rWORD8, rOFF32, rSTR2
+	addi	rSTR1, rSTR1, 32
+	addi	rSTR2, rSTR2, 32
+	bne	cr1, L(dLcr1)
+	cmpld	cr7, rWORD1, rWORD2
+	bdnz	L(dLoop)
+
+L(dL4):
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(dLcr6)
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(dLcr5)
+	cmpld	cr5, rWORD7, rWORD8
+L(d44):
+	bne	cr7, L(dLcr7)
+L(d34):
+	bne	cr1, L(dLcr1)
+L(d24):
+	bne	cr6, L(dLcr6)
+L(d14):
+	sldi.	r12, rN, 3
+	bne	cr5, L(dLcr5)
+L(d04):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
+	beq	L(duzeroLength)
+/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
+   we are aligned it is safe to load the whole double word, and use
+   shift right double to eliminate bits beyond the compare length.  */
+L(d00):
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
+	srd	rWORD1, rWORD1, rN
+	srd	rWORD2, rWORD2, rN
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr7, L(dLcr7x)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 0
+	blr
+
+	.align	4
+L(dLcr7):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+L(dLcr7x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 1
+	bgtlr	cr7
+	li	rRTN, -1
+	blr
+	.align	4
+L(dLcr1):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+L(dLcr1x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 1
+	bgtlr	cr1
+	li	rRTN, -1
+	blr
+	.align	4
+L(dLcr6):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+L(dLcr6x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 1
+	bgtlr	cr6
+	li	rRTN, -1
+	blr
+	.align	4
+L(dLcr5):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+L(dLcr5x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 1
+	bgtlr	cr5
+	li	rRTN, -1
+	blr
+
+	.align	4
+L(bytealigned):
+	mtctr	rN
+
+/* We need to prime this loop.  This loop is swing modulo scheduled
+   to avoid pipe delays.  The dependent instruction latencies (load to
+   compare to conditional branch) are 2 to 3 cycles.  In this loop each
+   dispatch group ends in a branch and takes 1 cycle.  Effectively
+   the first iteration of the loop only serves to load operands, and
+   branches based on compares are delayed until the next iteration.
+
+   So we must precondition some registers and condition codes so that
+   we don't exit the loop early on the first iteration.  */
+
+	lbz	rWORD1, 0(rSTR1)
+	lbz	rWORD2, 0(rSTR2)
+	bdz	L(b11)
+	cmpld	cr7, rWORD1, rWORD2
+	lbz	rWORD3, 1(rSTR1)
+	lbz	rWORD4, 1(rSTR2)
+	bdz	L(b12)
+	cmpld	cr1, rWORD3, rWORD4
+	lbzu	rWORD5, 2(rSTR1)
+	lbzu	rWORD6, 2(rSTR2)
+	bdz	L(b13)
+	.align	4
+L(bLoop):
+	lbzu	rWORD1, 1(rSTR1)
+	lbzu	rWORD2, 1(rSTR2)
+	bne	cr7, L(bLcr7)
+
+	cmpld	cr6, rWORD5, rWORD6
+	bdz	L(b3i)
+
+	lbzu	rWORD3, 1(rSTR1)
+	lbzu	rWORD4, 1(rSTR2)
+	bne	cr1, L(bLcr1)
+
+	cmpld	cr7, rWORD1, rWORD2
+	bdz	L(b2i)
+
+	lbzu	rWORD5, 1(rSTR1)
+	lbzu	rWORD6, 1(rSTR2)
+	bne	cr6, L(bLcr6)
+
+	cmpld	cr1, rWORD3, rWORD4
+	bdnz	L(bLoop)
+
+/* We speculatively load bytes before we have tested the previous
+   bytes.  But we must avoid overrunning the length (in the CTR) to
+   prevent these speculative loads from causing a segfault.  In that
+   case the loop will exit early (before all pending bytes are
+   tested), and we must complete the pending compares before
+   returning.  */
+L(b1i):
+	bne	cr7, L(bLcr7)
+	bne	cr1, L(bLcr1)
+	b	L(bx56)
+	.align	4
+L(b2i):
+	bne	cr6, L(bLcr6)
+	bne	cr7, L(bLcr7)
+	b	L(bx34)
+	.align	4
+L(b3i):
+	bne	cr1, L(bLcr1)
+	bne	cr6, L(bLcr6)
+	b	L(bx12)
+	.align	4
+L(bLcr7):
+	li	rRTN, 1
+	bgtlr	cr7
+	li	rRTN, -1
+	blr
+L(bLcr1):
+	li	rRTN, 1
+	bgtlr	cr1
+	li	rRTN, -1
+	blr
+L(bLcr6):
+	li	rRTN, 1
+	bgtlr	cr6
+	li	rRTN, -1
+	blr
+
+L(b13):
+	bne	cr7, L(bx12)
+	bne	cr1, L(bx34)
+L(bx56):
+	sub	rRTN, rWORD5, rWORD6
+	blr
+	nop
+L(b12):
+	bne	cr7, L(bx12)
+L(bx34):
+	sub	rRTN, rWORD3, rWORD4
+	blr
+L(b11):
+L(bx12):
+	sub	rRTN, rWORD1, rWORD2
+	blr
+
+	.align	4
+L(zeroLength):
+	li	rRTN, 0
+	blr
+
+	.align	4
+/* At this point we know the strings have different alignment and the
+   compare length is at least 8 bytes.  r12 contains the low order
+   3 bits of rSTR1 and cr5 contains the result of the logical compare
+   of r12 to 0.  If r12 == 0 then rSTR1 is double word
+   aligned and we can perform the DWunaligned loop.
+
+   Otherwise we know that rSTR1 is not already DW aligned yet.
+   So we can force the string addresses to the next lower DW
+   boundary and special case this first DW using shift left to
+   eliminate bits preceding the first byte.  Since we want to join the
+   normal (DWaligned) compare loop, starting at the second double word,
+   we need to adjust the length (rN) and special case the loop
+   versioning for the first DW. This ensures that the loop count is
+   correct and the first DW (shifted) is in the expected register pair.  */
+L(unaligned):
+	std	rSHL, rSHLSAVE(r1)
+	cfi_offset(rSHL, rSHLSAVE)
+	clrldi	rSHL, rSTR2, 61
+	beq	cr6, L(duzeroLength)
+	std	rSHR, rSHRSAVE(r1)
+	cfi_offset(rSHR, rSHRSAVE)
+	beq	cr5, L(DWunaligned)
+	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
+/* Adjust the logical start of rSTR2 to compensate for the extra bits
+   in the 1st rSTR1 DW.  */
+	sub	rWORD8_SHIFT, rSTR2, r12
+/* But do not attempt to address the DW before the DW that contains
+   the actual start of rSTR2.  */
+	clrrdi	rSTR2, rSTR2, 3
+	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+/* Compute the left/right shift counts for the unaligned rSTR2,
+   compensating for the logical (DW aligned) start of rSTR1.  */
+	clrldi	rSHL, rWORD8_SHIFT, 61
+	clrrdi	rSTR1, rSTR1, 3
+	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+	sldi	rSHL, rSHL, 3
+	cmpld	cr5, rWORD8_SHIFT, rSTR2
+	add	rN, rN, r12
+	sldi	rWORD6, r12, 3
+	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
+	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
+	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
+	subfic	rSHR, rSHL, 64
+	srdi	r0, rN, 5	/* Divide by 32 */
+	andi.	r12, rN, 24	/* Get the DW remainder */
+/* We normally need to load 2 DWs to start the unaligned rSTR2, but in
+   this special case those bits may be discarded anyway.  Also we
+   must avoid loading a DW where none of the bits are part of rSTR2 as
+   this may cross a page boundary and cause a page fault.  */
+	li	rWORD8, 0
+	blt	cr5, L(dus0)
+	LD	rWORD8, 0, rSTR2
+	addi	rSTR2, rSTR2, 8
+	sld	rWORD8, rWORD8, rSHL
+
+L(dus0):
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	srd	r12, rWORD2, rSHR
+	clrldi	rN, rN, 61
+	beq	L(duPs4)
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	bgt	cr1, L(duPs3)
+	beq	cr1, L(duPs2)
+
+/* Remainder is 8 */
+	.align	4
+L(dusP1):
+	sld	rWORD8_SHIFT, rWORD2, rSHL
+	sld	rWORD7, rWORD1, rWORD6
+	sld	rWORD8, rWORD8, rWORD6
+	bge	cr7, L(duP1e)
+/* At this point we exit early with the first double word compare
+   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
+   how we handle the remaining bytes.  */
+	cmpld	cr5, rWORD7, rWORD8
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+	LD	rWORD2, rOFF8, rSTR2
+	srd	r0, rWORD2, rSHR
+	b	L(dutrim)
+/* Remainder is 16 */
+	.align	4
+L(duPs2):
+	sld	rWORD6_SHIFT, rWORD2, rSHL
+	sld	rWORD5, rWORD1, rWORD6
+	sld	rWORD6, rWORD8, rWORD6
+	b	L(duP2e)
+/* Remainder is 24 */
+	.align	4
+L(duPs3):
+	sld	rWORD4_SHIFT, rWORD2, rSHL
+	sld	rWORD3, rWORD1, rWORD6
+	sld	rWORD4, rWORD8, rWORD6
+	b	L(duP3e)
+/* Count is a multiple of 32, remainder is 0 */
+	.align	4
+L(duPs4):
+	mtctr	r0
+	or	rWORD8, r12, rWORD8
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	sld	rWORD1, rWORD1, rWORD6
+	sld	rWORD2, rWORD8, rWORD6
+	b	L(duP4e)
+
+/* At this point we know rSTR1 is double word aligned and the
+   compare length is at least 8 bytes.  */
+	.align	4
+L(DWunaligned):
+	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	clrrdi	rSTR2, rSTR2, 3
+	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	srdi	r0, rN, 5	/* Divide by 32 */
+	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+	andi.	r12, rN, 24	/* Get the DW remainder */
+	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
+	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
+	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
+	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
+	sldi	rSHL, rSHL, 3
+	LD	rWORD6, 0, rSTR2
+	LD	rWORD8, rOFF8, rSTR2
+	addi	rSTR2, rSTR2, 8
+	cmpldi	cr1, r12, 16
+	cmpldi	cr7, rN, 32
+	clrldi	rN, rN, 61
+	subfic	rSHR, rSHL, 64
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	beq	L(duP4)
+	mtctr	r0
+	bgt	cr1, L(duP3)
+	beq	cr1, L(duP2)
+
+/* Remainder is 8 */
+	.align	4
+L(duP1):
+	srd	r12, rWORD8, rSHR
+	LD	rWORD7, 0, rSTR1
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP1x)
+L(duP1e):
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	bne	cr5, L(duLcr5)
+	or	rWORD4, r12, rWORD2_SHIFT
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	bne	cr7, L(duLcr7)
+	or	rWORD6, r0, rWORD4_SHIFT
+	cmpld	cr6, rWORD5, rWORD6
+	b	L(duLoop3)
+	.align	4
+/* At this point we exit early with the first double word compare
+   complete and remainder of 0 to 7 bytes.  See L(du14) for details on
+   how we handle the remaining bytes.  */
+L(duP1x):
+	cmpld	cr5, rWORD7, rWORD8
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+	LD	rWORD2, rOFF8, rSTR2
+	srd	r0, rWORD2, rSHR
+	b	L(dutrim)
+/* Remainder is 16 */
+	.align	4
+L(duP2):
+	srd	r0, rWORD8, rSHR
+	LD	rWORD5, 0, rSTR1
+	or	rWORD6, r0, rWORD6_SHIFT
+	sld	rWORD6_SHIFT, rWORD8, rSHL
+L(duP2e):
+	LD	rWORD7, rOFF8, rSTR1
+	LD	rWORD8, rOFF8, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP2x)
+	LD	rWORD1, rOFF16, rSTR1
+	LD	rWORD2, rOFF16, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+	LD	rWORD3, rOFF24, rSTR1
+	LD	rWORD4, rOFF24, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr5, L(duLcr5)
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+	cmpld	cr1, rWORD3, rWORD4
+	b	L(duLoop2)
+	.align	4
+L(duP2x):
+	cmpld	cr5, rWORD7, rWORD8
+	addi	rSTR1, rSTR1, 8
+	addi	rSTR2, rSTR2, 8
+	bne	cr6, L(duLcr6)
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+	LD	rWORD2, rOFF8, rSTR2
+	srd	r0, rWORD2, rSHR
+	b	L(dutrim)
+
+/* Remainder is 24 */
+	.align	4
+L(duP3):
+	srd	r12, rWORD8, rSHR
+	LD	rWORD3, 0, rSTR1
+	sld	rWORD4_SHIFT, rWORD8, rSHL
+	or	rWORD4, r12, rWORD6_SHIFT
+L(duP3e):
+	LD	rWORD5, rOFF8, rSTR1
+	LD	rWORD6, rOFF8, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+	LD	rWORD7, rOFF16, rSTR1
+	LD	rWORD8, rOFF16, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	blt	cr7, L(duP3x)
+	LD	rWORD1, rOFF24, rSTR1
+	LD	rWORD2, rOFF24, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+	cmpld	cr7, rWORD1, rWORD2
+	b	L(duLoop1)
+	.align	4
+L(duP3x):
+	addi	rSTR1, rSTR1, 16
+	addi	rSTR2, rSTR2, 16
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr6, L(duLcr6)
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+	cmpld	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+	LD	rWORD2, rOFF8, rSTR2
+	srd	r0, rWORD2, rSHR
+	b	L(dutrim)
+
+/* Count is a multiple of 32, remainder is 0 */
+	.align	4
+L(duP4):
+	mtctr	r0
+	srd	r0, rWORD8, rSHR
+	LD	rWORD1, 0, rSTR1
+	sld	rWORD2_SHIFT, rWORD8, rSHL
+	or	rWORD2, r0, rWORD6_SHIFT
+L(duP4e):
+	LD	rWORD3, rOFF8, rSTR1
+	LD	rWORD4, rOFF8, rSTR2
+	cmpld	cr7, rWORD1, rWORD2
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+	LD	rWORD5, rOFF16, rSTR1
+	LD	rWORD6, rOFF16, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr7, L(duLcr7)
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+	LD	rWORD7, rOFF24, rSTR1
+	LD	rWORD8, rOFF24, rSTR2
+	addi	rSTR1, rSTR1, 24
+	addi	rSTR2, rSTR2, 24
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr1, L(duLcr1)
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	cmpld	cr5, rWORD7, rWORD8
+	bdz	L(du24)		/* Adjust CTR as we start with +4 */
+/* This is the primary loop */
+	.align	4
+L(duLoop):
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	srd	r0, rWORD2, rSHR
+	sld	rWORD2_SHIFT, rWORD2, rSHL
+	or	rWORD2, r0, rWORD8_SHIFT
+L(duLoop1):
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	srd	r12, rWORD4, rSHR
+	sld	rWORD4_SHIFT, rWORD4, rSHL
+	or	rWORD4, r12, rWORD2_SHIFT
+L(duLoop2):
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
+	cmpld	cr5, rWORD7, rWORD8
+	bne	cr7, L(duLcr7)
+	srd	r0, rWORD6, rSHR
+	sld	rWORD6_SHIFT, rWORD6, rSHL
+	or	rWORD6, r0, rWORD4_SHIFT
+L(duLoop3):
+	LD	rWORD7, rOFF32, rSTR1
+	LD	rWORD8, rOFF32, rSTR2
+	addi	rSTR1, rSTR1, 32
+	addi	rSTR2, rSTR2, 32
+	cmpld	cr7, rWORD1, rWORD2
+	bne	cr1, L(duLcr1)
+	srd	r12, rWORD8, rSHR
+	sld	rWORD8_SHIFT, rWORD8, rSHL
+	or	rWORD8, r12, rWORD6_SHIFT
+	bdnz	L(duLoop)
+
+L(duL4):
+	cmpld	cr1, rWORD3, rWORD4
+	bne	cr6, L(duLcr6)
+	cmpld	cr6, rWORD5, rWORD6
+	bne	cr5, L(duLcr5)
+	cmpld	cr5, rWORD7, rWORD8
+L(du44):
+	bne	cr7, L(duLcr7)
+L(du34):
+	bne	cr1, L(duLcr1)
+L(du24):
+	bne	cr6, L(duLcr6)
+L(du14):
+	sldi.	rN, rN, 3
+	bne	cr5, L(duLcr5)
+/* At this point we have a remainder of 1 to 7 bytes to compare.  We use
+   shift right double to eliminate bits beyond the compare length.
+
+   However it may not be safe to load rWORD2 which may be beyond the
+   string length. So we compare the bit length of the remainder to
+   the right shift count (rSHR). If the bit count is less than or equal
+   we do not need to load rWORD2 (all significant bits are already in
+   rWORD8_SHIFT).  */
+	cmpld	cr7, rN, rSHR
+	beq	L(duZeroReturn)
+	li	r0, 0
+	ble	cr7, L(dutrim)
+	LD	rWORD2, rOFF8, rSTR2
+	srd	r0, rWORD2, rSHR
+	.align	4
+L(dutrim):
+	LD	rWORD1, rOFF8, rSTR1
+	ld	rWORD8, -8(r1)
+	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
+	or	rWORD2, r0, rWORD8_SHIFT
+	ld	rWORD7, rWORD7SAVE(r1)
+	ld	rSHL, rSHLSAVE(r1)
+	srd	rWORD1, rWORD1, rN
+	srd	rWORD2, rWORD2, rN
+	ld	rSHR, rSHRSAVE(r1)
+	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	li	rRTN, 0
+	cmpld	cr7, rWORD1, rWORD2
+	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+	beq	cr7, L(dureturn24)
+	li	rRTN, 1
+	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	bgtlr	cr7
+	li	rRTN, -1
+	blr
+	.align	4
+L(duLcr7):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	li	rRTN, 1
+	bgt	cr7, L(dureturn29)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
+	li	rRTN, -1
+	b	L(dureturn27)
+	.align	4
+L(duLcr1):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	li	rRTN, 1
+	bgt	cr1, L(dureturn29)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
+	li	rRTN, -1
+	b	L(dureturn27)
+	.align	4
+L(duLcr6):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	li	rRTN, 1
+	bgt	cr6, L(dureturn29)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
+	li	rRTN, -1
+	b	L(dureturn27)
+	.align	4
+L(duLcr5):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	li	rRTN, 1
+	bgt	cr5, L(dureturn29)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
+	li	rRTN, -1
+	b	L(dureturn27)
+
+	.align	3
+L(duZeroReturn):
+	li	rRTN, 0
+	.align	4
+L(dureturn):
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+L(dureturn29):
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
+L(dureturn27):
+	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+L(dureturn24):
+	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	blr
+
+L(duzeroLength):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
+	li	rRTN, 0
+	blr
+
+END (MEMCMP)
+libc_hidden_builtin_def (memcmp)
+weak_alias (memcmp, bcmp)
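
The LD macro (ldx on big-endian, ldbrx on little-endian) is what makes the word-at-a-time compare work: either way the lowest-addressed byte ends up most significant, so a single unsigned doubleword compare yields memcmp's lexicographic ordering. A C reduction of that idea (illustrative, hypothetical name):

#include <stdint.h>

/* Compare 8 bytes by assembling them most-significant-first; the
   unsigned compare then matches byte-by-byte memcmp ordering, as in
   the bgtlr / li -1 return tails above.  */
static int
model_memcmp_dword (const unsigned char *s1, const unsigned char *s2)
{
  uint64_t w1 = 0, w2 = 0;
  for (int i = 0; i < 8; i++)
    {
      w1 = (w1 << 8) | s1[i];
      w2 = (w2 << 8) | s2[i];
    }
  if (w1 == w2)
    return 0;
  return w1 > w2 ? 1 : -1;
}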
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcpy.S
new file mode 100644
index 0000000000..e08993cbc3
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -0,0 +1,430 @@
+/* Optimized memcpy implementation for PowerPC64/POWER7.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+
+/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+   Returns 'dst'.  */
+
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#define dst 11		/* Use r11 so r3 is kept unchanged.  */
+#define src 4
+#define cnt 5
+
+	.machine power7
+EALIGN (MEMCPY, 5, 0)
+	CALL_MCOUNT 3
+
+	cmpldi	cr1,cnt,31
+	neg	0,3
+	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
+				    code.  */
+
+/* Align copies to a quadword before using VSX instructions.  This avoids
+   alignment traps when memcpy is used on non-cacheable memory (for
+   instance, memory mapped I/O).  */
+	andi.	10,3,15
+	clrldi	11,4,60
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
+
+	mr	dst,3
+	bne	cr6,L(copy_GE_32_unaligned)
+	beq	L(aligned_copy)
+
+	mtocrf	0x01,0
+	clrldi	0,0,60
+
+/* Get the DST and SRC aligned to 16 bytes.  */
+1:
+	bf	31,2f
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+2:
+	bf	30,4f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+4:
+	bf	29,8f
+	lwz	6,0(src)
+	addi	src,src,4
+	stw	6,0(dst)
+	addi	dst,dst,4
+8:
+	bf	28,16f
+	ld	6,0(src)
+	addi	src,src,8
+	std	6,0(dst)
+	addi	dst,dst,8
+16:
+	subf	cnt,0,cnt
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+	li	6,16
+	li	7,32
+	li	8,48
+	mtocrf	0x02,cnt
+	srdi	12,cnt,7
+	cmpdi	12,0
+	beq	L(aligned_tail)
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+	mtctr	12
+	b	L(aligned_128loop)
+
+	.align  4
+L(aligned_128head):
+	/* For the 2nd and later iterations of this loop.  */
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+L(aligned_128loop):
+	lxvd2x	8,src,7
+	lxvd2x	9,src,8
+	stxvd2x	6,0,dst
+	addi	src,src,64
+	stxvd2x	7,dst,6
+	stxvd2x	8,dst,7
+	stxvd2x	9,dst,8
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+	addi	dst,dst,64
+	lxvd2x	8,src,7
+	lxvd2x	9,src,8
+	addi	src,src,64
+	stxvd2x	6,0,dst
+	stxvd2x	7,dst,6
+	stxvd2x	8,dst,7
+	stxvd2x	9,dst,8
+	addi	dst,dst,64
+	bdnz	L(aligned_128head)
+
+L(aligned_tail):
+	mtocrf	0x01,cnt
+	bf	25,32f
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+	lxvd2x	8,src,7
+	lxvd2x	9,src,8
+	addi	src,src,64
+	stxvd2x	6,0,dst
+	stxvd2x	7,dst,6
+	stxvd2x	8,dst,7
+	stxvd2x	9,dst,8
+	addi	dst,dst,64
+32:
+	bf	26,16f
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+	addi	src,src,32
+	stxvd2x	6,0,dst
+	stxvd2x	7,dst,6
+	addi	dst,dst,32
+16:
+	bf	27,8f
+	lxvd2x	6,0,src
+	addi	src,src,16
+	stxvd2x	6,0,dst
+	addi	dst,dst,16
+8:
+	bf	28,4f
+	ld	6,0(src)
+	addi	src,src,8
+	std     6,0(dst)
+	addi	dst,dst,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw     6,0(dst)
+	bf      30,L(tail5)
+	lhz     7,4(src)
+	sth     7,4(dst)
+	bflr	31
+	lbz     8,6(src)
+	stb     8,6(dst)
+	/* Return original DST pointer.  */
+	blr
+
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32):
+	mr	dst,3
+	cmpldi	cr6,cnt,8
+	mtocrf	0x01,cnt
+	ble	cr6,L(copy_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	8,4
+	andi.	0,8,3
+	cmpldi	cr1,cnt,16
+	beq	L(copy_LT_32_aligned)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	cnt,0,cnt
+2:
+	bf	30,1f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+1:
+	bf	31,L(end_4bytes_alignment)
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+
+	.align	4
+L(end_4bytes_alignment):
+	cmpldi	cr1,cnt,16
+	mtocrf	0x01,cnt
+
+L(copy_LT_32_aligned):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	6,0(src)
+	lwz	7,4(src)
+	stw	6,0(dst)
+	lwz	8,8(src)
+	stw	7,4(dst)
+	lwz	6,12(src)
+	addi	src,src,16
+	stw	8,8(dst)
+	stw	6,12(dst)
+	addi	dst,dst,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4)
+	lwz	6,0(src)
+	lwz	7,4(src)
+	addi	src,src,8
+	stw	6,0(dst)
+	stw	7,4(dst)
+	addi	dst,dst,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw	6,0(dst)
+	bf	30,L(tail5)
+	lhz	7,4(src)
+	sth	7,4(dst)
+	bflr	31
+	lbz	8,6(src)
+	stb	8,6(dst)
+	/* Return original DST pointer.  */
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	lhz	6,0(src)
+	sth	6,0(dst)
+	bflr	31
+	lbz	7,2(src)
+	stb	7,2(dst)
+	blr
+
+	.align	4
+L(tail5):
+	bflr	31
+	lbz	6,4(src)
+	stb	6,4(dst)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	6,0(src)
+	stb	6,0(dst)
+	/* Return original DST pointer.  */
+	blr
+
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8):
+	bne	cr6,L(tail4)
+
+	/* Though we could've used ld/std here, they are still
+	slow for unaligned cases.  */
+
+	lwz	6,0(src)
+	lwz	7,4(src)
+	stw	6,0(dst)
+	stw	7,4(dst)
+	blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned):
+	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont)
+
+	/* DST is not quadword aligned, get it aligned.  */
+
+	mtocrf	0x01,0
+	subf	cnt,0,cnt
+
+	/* Vector instructions work best when proper alignment (16-bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+	bf	31,2f
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+2:
+	bf	30,4f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+4:
+	bf	29,8f
+	lwz	6,0(src)
+	addi	src,src,4
+	stw	6,0(dst)
+	addi	dst,dst,4
+8:
+	bf	28,0f
+	ld	6,0(src)
+	addi	src,src,8
+	std	6,0(dst)
+	addi	dst,dst,8
+0:
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont):
+
+	/* Setup two indexes to speed up the indexed vector operations.  */
+	clrldi	10,cnt,60
+	li	6,16	      /* Index for 16-bytes offsets.  */
+	li	7,32	      /* Index for 32-bytes offsets.  */
+	cmpldi	cr1,10,0
+	srdi	8,cnt,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,9
+	cmpldi	cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	5,0,src
+#else
+	lvsl	5,0,src
+#endif
+	lvx	3,0,src
+	li	0,0
+	bf	31,L(setup_unaligned_loop)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	4,src,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	addi	src,src,16
+	stvx	6,0,dst
+	addi	dst,dst,16
+	vor	3,4,4
+	clrrdi	0,src,60
+
+L(setup_unaligned_loop):
+	mtctr	8
+	ble	cr6,L(end_unaligned_loop)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again.  This is still faster than using unaligned
+	vector instructions.  */
+
+	lvx	4,src,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	lvx	3,src,7
+#ifdef __LITTLE_ENDIAN__
+	vperm	10,3,4,5
+#else
+	vperm	10,4,3,5
+#endif
+	addi	src,src,32
+	stvx	6,0,dst
+	stvx	10,dst,6
+	addi	dst,dst,32
+	bdnz	L(unaligned_loop)
+
+	clrrdi	0,src,60
+
+	.align	4
+L(end_unaligned_loop):
+
+	/* Check for tail bytes.  */
+	mtocrf	0x01,cnt
+	beqlr	cr1
+
+	add	src,src,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+	/* Copy 8 bytes.  */
+	bf	28,4f
+	lwz	6,0(src)
+	lwz	7,4(src)
+	addi	src,src,8
+	stw	6,0(dst)
+	stw	7,4(dst)
+	addi	dst,dst,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw	6,0(dst)
+	bf	30,L(tail5)
+	lhz	7,4(src)
+	sth	7,4(dst)
+	bflr	31
+	lbz	8,6(src)
+	stb	8,6(dst)
+	/* Return original DST pointer.  */
+	blr
+
+END_GEN_TB (MEMCPY,TB_TOCLESS)
+libc_hidden_builtin_def (memcpy)
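
The L(aligned_tail) code above dispatches on the low bits of the remaining
count: mtocrf moves them into a condition-register field, and each bf test
copies one power-of-two chunk at most once (bit 25 selects 64 bytes, down
to bit 31 for the final byte).  A minimal C sketch of that dispatch,
illustrative only (copy_tail is a hypothetical name, not the glibc code):

#include <string.h>
#include <stddef.h>

/* After the 128-byte loop, each set bit of the leftover count selects
   exactly one copy of that size, mirroring the bf 25..31 chain.  */
static void copy_tail (char *dst, const char *src, size_t cnt)
{
  size_t rem = cnt & 127;      /* what the 128-byte loop left over */
  if (rem & 64) { memcpy (dst, src, 64); dst += 64; src += 64; }
  if (rem & 32) { memcpy (dst, src, 32); dst += 32; src += 32; }
  if (rem & 16) { memcpy (dst, src, 16); dst += 16; src += 16; }
  if (rem & 8)  { memcpy (dst, src, 8);  dst += 8;  src += 8;  }
  if (rem & 4)  { memcpy (dst, src, 4);  dst += 4;  src += 4;  }
  if (rem & 2)  { memcpy (dst, src, 2);  dst += 2;  src += 2;  }
  if (rem & 1)  *dst = *src;
}
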
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memmove.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memmove.S
new file mode 100644
index 0000000000..4c0f7c3571
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -0,0 +1,835 @@
+/* Optimized memmove implementation for PowerPC64/POWER7.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+
+/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
+
+   This implementation checks if memory 'dest' overlaps with 'src'.  If it
+   does not, an optimized forward memcpy is used (similar to memcpy for
+   POWER7, embedded here to gain some cycles).
+   If source and destination overlap, an optimized backwards memcpy is used
+   instead.  */
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+	.machine power7
+EALIGN (MEMMOVE, 5, 0)
+	CALL_MCOUNT 3
+
+L(_memmove):
+	subf    r9,r4,r3
+	cmpld   cr7,r9,r5
+	blt	cr7,L(memmove_bwd)
+
+	cmpldi	cr1,r5,31
+	neg	0,3
+	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
+				       code.  */
+
+	andi.	10,3,15
+	clrldi	11,4,60
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
+
+	mr	r11,3
+	bne	cr6,L(copy_GE_32_unaligned)
+	beq	L(aligned_copy)
+
+	mtocrf	0x01,0
+	clrldi	0,0,60
+
+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
+1:
+	bf	31,2f
+	lbz	6,0(r4)
+	addi	r4,r4,1
+	stb	6,0(r11)
+	addi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	6,0(r4)
+	addi	r4,r4,2
+	sth	6,0(r11)
+	addi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	6,0(r4)
+	addi	r4,r4,4
+	stw	6,0(r11)
+	addi	r11,r11,4
+8:
+	bf	28,16f
+	ld	6,0(r4)
+	addi	r4,r4,8
+	std	6,0(r11)
+	addi	r11,r11,8
+16:
+	subf	r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+	li	6,16
+	li	7,32
+	li	8,48
+	mtocrf	0x02,r5
+	srdi	12,r5,7
+	cmpdi	12,0
+	beq	L(aligned_tail)
+	lxvd2x	6,0,r4
+	lxvd2x	7,r4,6
+	mtctr	12
+	b	L(aligned_128loop)
+
+	.align  4
+L(aligned_128head):
+	/* For the 2nd and subsequent iterations of this loop. */
+	lxvd2x	6,0,r4
+	lxvd2x	7,r4,6
+L(aligned_128loop):
+	lxvd2x	8,r4,7
+	lxvd2x	9,r4,8
+	stxvd2x	6,0,r11
+	addi	r4,r4,64
+	stxvd2x	7,r11,6
+	stxvd2x	8,r11,7
+	stxvd2x	9,r11,8
+	lxvd2x	6,0,r4
+	lxvd2x	7,r4,6
+	addi	r11,r11,64
+	lxvd2x	8,r4,7
+	lxvd2x	9,r4,8
+	addi	r4,r4,64
+	stxvd2x	6,0,r11
+	stxvd2x	7,r11,6
+	stxvd2x	8,r11,7
+	stxvd2x	9,r11,8
+	addi	r11,r11,64
+	bdnz	L(aligned_128head)
+
+L(aligned_tail):
+	mtocrf	0x01,r5
+	bf	25,32f
+	lxvd2x	6,0,r4
+	lxvd2x	7,r4,6
+	lxvd2x	8,r4,7
+	lxvd2x	9,r4,8
+	addi	r4,r4,64
+	stxvd2x	6,0,r11
+	stxvd2x	7,r11,6
+	stxvd2x	8,r11,7
+	stxvd2x	9,r11,8
+	addi	r11,r11,64
+32:
+	bf	26,16f
+	lxvd2x	6,0,r4
+	lxvd2x	7,r4,6
+	addi	r4,r4,32
+	stxvd2x	6,0,r11
+	stxvd2x	7,r11,6
+	addi	r11,r11,32
+16:
+	bf	27,8f
+	lxvd2x	6,0,r4
+	addi	r4,r4,16
+	stxvd2x	6,0,r11
+	addi	r11,r11,16
+8:
+	bf	28,4f
+	ld	6,0(r4)
+	addi	r4,r4,8
+	std     6,0(r11)
+	addi	r11,r11,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(r4)
+	stw     6,0(r11)
+	bf      30,L(tail5)
+	lhz     7,4(r4)
+	sth     7,4(r11)
+	bflr	31
+	lbz     8,6(r4)
+	stb     8,6(r11)
+	/* Return original DST pointer.  */
+	blr
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32):
+	mr	r11,3
+	cmpldi	cr6,r5,8
+	mtocrf	0x01,r5
+	ble	cr6,L(copy_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	8,4
+	andi.	0,8,3
+	cmpldi	cr1,r5,16
+	beq	L(copy_LT_32_aligned)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	r5,0,r5
+2:
+	bf	30,1f
+	lhz	6,0(r4)
+	addi	r4,r4,2
+	sth	6,0(r11)
+	addi	r11,r11,2
+1:
+	bf	31,L(end_4bytes_alignment)
+	lbz	6,0(r4)
+	addi	r4,r4,1
+	stb	6,0(r11)
+	addi	r11,r11,1
+
+	.align	4
+L(end_4bytes_alignment):
+	cmpldi	cr1,r5,16
+	mtocrf	0x01,r5
+
+L(copy_LT_32_aligned):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	stw	6,0(r11)
+	lwz	8,8(r4)
+	stw	7,4(r11)
+	lwz	6,12(r4)
+	addi	r4,r4,16
+	stw	8,8(r11)
+	stw	6,12(r11)
+	addi	r11,r11,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4)
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	addi	r4,r4,8
+	stw	6,0(r11)
+	stw	7,4(r11)
+	addi	r11,r11,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	6,0(r4)
+	stw	6,0(r11)
+	bf	30,L(tail5)
+	lhz	7,4(r4)
+	sth	7,4(r11)
+	bflr	31
+	lbz	8,6(r4)
+	stb	8,6(r11)
+	/* Return original DST pointer.  */
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	lhz	6,0(r4)
+	sth	6,0(r11)
+	bflr	31
+	lbz	7,2(r4)
+	stb	7,2(r11)
+	blr
+
+	.align	4
+L(tail5):
+	bflr	31
+	lbz	6,4(r4)
+	stb	6,4(r11)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	6,0(r4)
+	stb	6,0(r11)
+	/* Return original DST pointer.  */
+	blr
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8):
+	bne	cr6,L(tail4)
+
+	/* Though we could've used ld/std here, they are still
+	slow for unaligned cases.  */
+
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	stw	6,0(r11)
+	stw	7,4(r11)
+	blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned):
+	clrldi	0,0,60	      /* Number of bytes until the 1st DST (r11) quadword.  */
+	srdi	9,r5,4	      /* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont)
+
+	/* DST is not quadword aligned, get it aligned.  */
+
+	mtocrf	0x01,0
+	subf	r5,0,r5
+
+	/* Vector instructions work best when proper alignment (16-bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+	bf	31,2f
+	lbz	6,0(r4)
+	addi	r4,r4,1
+	stb	6,0(r11)
+	addi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	6,0(r4)
+	addi	r4,r4,2
+	sth	6,0(r11)
+	addi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	6,0(r4)
+	addi	r4,r4,4
+	stw	6,0(r11)
+	addi	r11,r11,4
+8:
+	bf	28,0f
+	ld	6,0(r4)
+	addi	r4,r4,8
+	std	6,0(r11)
+	addi	r11,r11,8
+0:
+	srdi	9,r5,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont):
+
+	/* Setup two indexes to speed up the indexed vector operations.  */
+	clrldi	10,r5,60
+	li	6,16	      /* Index for 16-bytes offsets.  */
+	li	7,32	      /* Index for 32-bytes offsets.  */
+	cmpldi	cr1,10,0
+	srdi	8,r5,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,9
+	cmpldi	cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	5,0,r4
+#else
+	lvsl	5,0,r4
+#endif
+	lvx	3,0,r4
+	li	0,0
+	bf	31,L(setup_unaligned_loop)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	4,r4,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	addi	r4,r4,16
+	stvx	6,0,r11
+	addi	r11,r11,16
+	vor	3,4,4
+	clrrdi	0,r4,60
+
+L(setup_unaligned_loop):
+	mtctr	8
+	ble	cr6,L(end_unaligned_loop)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again.  This is still faster than using unaligned
+	vector instructions.  */
+
+	lvx	4,r4,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5
+#else
+	vperm	6,3,4,5
+#endif
+	lvx	3,r4,7
+#ifdef __LITTLE_ENDIAN__
+	vperm	10,3,4,5
+#else
+	vperm	10,4,3,5
+#endif
+	addi	r4,r4,32
+	stvx	6,0,r11
+	stvx	10,r11,6
+	addi	r11,r11,32
+	bdnz	L(unaligned_loop)
+
+	clrrdi	0,r4,60
+
+	.align	4
+L(end_unaligned_loop):
+
+	/* Check for tail bytes.  */
+	mtocrf	0x01,r5
+	beqlr	cr1
+
+	add	r4,r4,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+	/* Copy 8 bytes.  */
+	bf	28,4f
+	lwz	6,0(r4)
+	lwz	7,4(r4)
+	addi	r4,r4,8
+	stw	6,0(r11)
+	stw	7,4(r11)
+	addi	r11,r11,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2)
+	lwz	6,0(r4)
+	stw	6,0(r11)
+	bf	30,L(tail5)
+	lhz	7,4(r4)
+	sth	7,4(r11)
+	bflr	31
+	lbz	8,6(r4)
+	stb	8,6(r11)
+	/* Return original DST pointer.  */
+	blr
+
+	/* Start of the backward memcpy implementation: the algorithm first
+	   checks whether src and dest share the same alignment.  If they do,
+	   it aligns both to 16 bytes and copies using VSX instructions.
+	   If they do not, it aligns dest to 16 bytes and uses VMX (altivec)
+	   instructions to read two 16-byte chunks at a time, shift/permute
+	   the bytes read, and write them aligned to dest.  */
+L(memmove_bwd):
+	cmpldi	cr1,r5,31
+	/* Copy is done backwards: update the pointers and check alignment.  */
+	add	r11,r3,r5
+	add	r4,r4,r5
+	mr	r0,r11
+	ble	cr1, L(copy_LT_32_bwd)  /* If move < 32 bytes use short move
+				           code.  */
+
+	andi.	r10,r11,15	    /* Check if r11 is aligned to 16 bytes  */
+	clrldi	r9,r4,60	    /* Check if r4 is aligned to 16 bytes  */
+	cmpld	cr6,r10,r9	    /* SRC and DST alignments match?  */
+
+	bne     cr6,L(copy_GE_32_unaligned_bwd)
+	beq     L(aligned_copy_bwd)
+
+	mtocrf	0x01,r0
+	clrldi	r0,r0,60
+
+/* Get the DST and SRC aligned to 16 bytes.  */
+1:
+	bf	31,2f
+	lbz	r6,-1(r4)
+	subi	r4,r4,1
+	stb	r6,-1(r11)
+	subi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	r6,-2(r4)
+	subi	r4,r4,2
+	sth	r6,-2(r11)
+	subi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	r6,-4(r4)
+	subi	r4,r4,4
+	stw	r6,-4(r11)
+	subi	r11,r11,4
+8:
+	bf	28,16f
+	ld	r6,-8(r4)
+	subi	r4,r4,8
+	std	r6,-8(r11)
+	subi	r11,r11,8
+16:
+	subf	r5,0,r5
+
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy_bwd):
+	li	r6,-16
+	li	r7,-32
+	li	r8,-48
+	li	r9,-64
+	mtocrf	0x02,r5
+	srdi	r12,r5,7
+	cmpdi	r12,0
+	beq	L(aligned_tail_bwd)
+	lxvd2x	v6,r4,r6
+	lxvd2x	v7,r4,r7
+	mtctr	12
+	b	L(aligned_128loop_bwd)
+
+	.align  4
+L(aligned_128head_bwd):
+	/* For the 2nd and subsequent iterations of this loop. */
+	lxvd2x	v6,r4,r6
+	lxvd2x	v7,r4,r7
+L(aligned_128loop_bwd):
+	lxvd2x	v8,r4,r8
+	lxvd2x	v9,r4,r9
+	stxvd2x	v6,r11,r6
+	subi	r4,r4,64
+	stxvd2x	v7,r11,r7
+	stxvd2x	v8,r11,r8
+	stxvd2x	v9,r11,r9
+	lxvd2x	v6,r4,r6
+	lxvd2x	v7,r4,r7
+	subi	r11,r11,64
+	lxvd2x	v8,r4,r8
+	lxvd2x	v9,r4,r9
+	subi	r4,r4,64
+	stxvd2x	v6,r11,r6
+	stxvd2x	v7,r11,r7
+	stxvd2x	v8,r11,r8
+	stxvd2x	v9,r11,r9
+	subi	r11,r11,64
+	bdnz	L(aligned_128head_bwd)
+
+L(aligned_tail_bwd):
+	mtocrf	0x01,r5
+	bf	25,32f
+	lxvd2x	v6,r4,r6
+	lxvd2x	v7,r4,r7
+	lxvd2x	v8,r4,r8
+	lxvd2x	v9,r4,r9
+	subi	r4,r4,64
+	stxvd2x	v6,r11,r6
+	stxvd2x	v7,r11,r7
+	stxvd2x	v8,r11,r8
+	stxvd2x	v9,r11,r9
+	subi	r11,r11,64
+32:
+	bf	26,16f
+	lxvd2x	v6,r4,r6
+	lxvd2x	v7,r4,r7
+	subi	r4,r4,32
+	stxvd2x	v6,r11,r6
+	stxvd2x	v7,r11,r7
+	subi	r11,r11,32
+16:
+	bf	27,8f
+	lxvd2x	v6,r4,r6
+	subi	r4,r4,16
+	stxvd2x	v6,r11,r6
+	subi	r11,r11,16
+8:
+	bf	28,4f
+	ld	r6,-8(r4)
+	subi	r4,r4,8
+	std     r6,-8(r11)
+	subi	r11,r11,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2_bwd)
+	lwz	r6,-4(r4)
+	stw     r6,-4(r11)
+	bf      30,L(tail5_bwd)
+	lhz     r7,-6(r4)
+	sth     r7,-6(r11)
+	bflr	31
+	lbz     r8,-7(r4)
+	stb     r8,-7(r11)
+	/* Return original DST pointer.  */
+	blr
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32_bwd):
+	cmpldi	cr6,r5,8
+	mtocrf	0x01,r5
+	ble	cr6,L(copy_LE_8_bwd)
+
+	/* At least 9 bytes to go.  */
+	neg	r8,r4
+	andi.	r0,r8,3
+	cmpldi	cr1,r5,16
+	beq	L(copy_LT_32_aligned_bwd)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	r5,0,r5
+2:
+	bf	30,1f
+	lhz	r6,-2(r4)
+	subi	r4,r4,2
+	sth	r6,-2(r11)
+	subi	r11,r11,2
+1:
+	bf	31,L(end_4bytes_alignment_bwd)
+	lbz	6,-1(r4)
+	subi	r4,r4,1
+	stb	6,-1(r11)
+	subi	r11,r11,1
+
+	.align	4
+L(end_4bytes_alignment_bwd):
+	cmpldi	cr1,r5,16
+	mtocrf	0x01,r5
+
+L(copy_LT_32_aligned_bwd):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	r6,-4(r4)
+	lwz	r7,-8(r4)
+	stw	r6,-4(r11)
+	lwz	r8,-12(r4)
+	stw	r7,-8(r11)
+	lwz	r6,-16(r4)
+	subi	r4,r4,16
+	stw	r8,-12(r11)
+	stw	r6,-16(r11)
+	subi	r11,r11,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4_bwd)
+	lwz	r6,-4(r4)
+	lwz	r7,-8(r4)
+	subi	r4,r4,8
+	stw	r6,-4(r11)
+	stw	r7,-8(r11)
+	subi	r11,r11,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4_bwd):
+	bf	29,L(tail2_bwd)
+	lwz	6,-4(r4)
+	stw	6,-4(r11)
+	bf	30,L(tail5_bwd)
+	lhz	7,-6(r4)
+	sth	7,-6(r11)
+	bflr	31
+	lbz	8,-7(r4)
+	stb	8,-7(r11)
+	/* Return original DST pointer.  */
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2_bwd):
+	bf	30,1f
+	lhz	6,-2(r4)
+	sth	6,-2(r11)
+	bflr	31
+	lbz	7,-3(r4)
+	stb	7,-3(r11)
+	blr
+
+	.align	4
+L(tail5_bwd):
+	bflr	31
+	lbz	6,-5(r4)
+	stb	6,-5(r11)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	6,-1(r4)
+	stb	6,-1(r11)
+	/* Return original DST pointer.  */
+	blr
+
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8_bwd):
+	bne	cr6,L(tail4_bwd)
+
+	/* Though we could've used ld/std here, they are still
+	   slow for unaligned cases.  */
+	lwz	6,-8(r4)
+	lwz	7,-4(r4)
+	stw	6,-8(r11)
+	stw	7,-4(r11)
+	blr
+
+
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned_bwd):
+	andi.	r10,r11,15      /* Check alignment of DST against 16 bytes.  */
+	srdi	r9,r5,4		/* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont_bwd)
+
+	/* DST is not quadword aligned: r10 holds its low four bits, i.e. the
+           number of bytes to copy backward until it becomes aligned.  */
+	mtocrf	0x01,r10
+	subf	r5,r10,r5
+
+	/* Vector instructions work best when proper alignment (16-bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:
+	bf	31,2f
+	lbz	r6,-1(r4)
+	subi	r4,r4,1
+	stb	r6,-1(r11)
+	subi	r11,r11,1
+2:
+	bf	30,4f
+	lhz	r6,-2(r4)
+	subi	r4,r4,2
+	sth	r6,-2(r11)
+	subi	r11,r11,2
+4:
+	bf	29,8f
+	lwz	r6,-4(r4)
+	subi	r4,r4,4
+	stw	r6,-4(r11)
+	subi	r11,r11,4
+8:
+	bf	28,0f
+	ld	r6,-8(r4)
+	subi	r4,r4,8
+	std	r6,-8(r11)
+	subi	r11,r11,8
+0:
+	srdi	r9,r5,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont_bwd):
+
+	/* Setup two indexes to speed up the indexed vector operations.  */
+	clrldi	r10,r5,60
+	li	r6,-16	      /* Index for 16-bytes offsets.  */
+	li	r7,-32	      /* Index for 32-bytes offsets.  */
+	cmpldi	cr1,r10,0
+	srdi	r8,r5,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,r9
+	cmpldi	cr6,r9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	v5,r0,r4
+#else
+	lvsl	v5,r0,r4
+#endif
+	lvx	v3,0,r4
+	li	r0,0
+	bf	31,L(setup_unaligned_loop_bwd)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+	vperm	v6,v3,v4,v5
+#else
+	vperm	v6,v4,v3,v5
+#endif
+	subi	r4,r4,16
+	stvx	v6,r11,r6
+	subi	r11,r11,16
+	vor	v3,v4,v4
+	clrrdi	r0,r4,60
+
+L(setup_unaligned_loop_bwd):
+	mtctr	r8
+	ble	cr6,L(end_unaligned_loop_bwd)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop_bwd):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again.  This is still faster than using unaligned
+	vector instructions.  */
+
+	lvx	v4,r4,r6
+#ifdef __LITTLE_ENDIAN__
+	vperm	v6,v3,v4,v5
+#else
+	vperm	v6,v4,v3,v5
+#endif
+	lvx	v3,r4,r7
+#ifdef __LITTLE_ENDIAN__
+	vperm	v10,v4,v3,v5
+#else
+	vperm	v10,v3,v4,v5
+#endif
+	subi	r4,r4,32
+	stvx	v6,r11,r6
+	stvx	v10,r11,r7
+	subi	r11,r11,32
+	bdnz	L(unaligned_loop_bwd)
+
+	clrrdi	r0,r4,60
+
+	.align	4
+L(end_unaligned_loop_bwd):
+
+	/* Check for tail bytes.  */
+	mtocrf	0x01,r5
+	beqlr	cr1
+
+	add	r4,r4,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+	/* Copy 8 bytes.  */
+	bf	28,4f
+	lwz	r6,-4(r4)
+	lwz	r7,-8(r4)
+	subi	r4,r4,8
+	stw	r6,-4(r11)
+	stw	r7,-8(r11)
+	subi	r11,r11,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2_bwd)
+	lwz	r6,-4(r4)
+	stw	r6,-4(r11)
+	bf	30,L(tail5_bwd)
+	lhz	r7,-6(r4)
+	sth	r7,-6(r11)
+	bflr	31
+	lbz	r8,-7(r4)
+	stb	r8,-7(r11)
+	/* Return original DST pointer.  */
+	blr
+END_GEN_TB (MEMMOVE, TB_TOCLESS)
+libc_hidden_builtin_def (memmove)
+
+
+/* void bcopy (const void *src [r3], void *dest [r4], size_t n [r5])
+   Implemented in this file to keep the linker from creating a stub for
+   the branch to '_memmove'.  */
+ENTRY (__bcopy)
+	mr	r6,r3
+	mr	r3,r4
+	mr	r4,r6
+	b	L(_memmove)
+END (__bcopy)
+weak_alias (__bcopy, bcopy)
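
The direction test at L(_memmove) is a single unsigned compare: subtract
the pointers and compare against the length, so wrap-around makes one
branch detect the only dangerous overlap (dest inside [src, src+len)).
A hedged C equivalent of just that decision (my_memmove is a hypothetical
name; the real code branches to the optimized forward or backward copiers
shown above):

#include <stdint.h>
#include <stddef.h>

void *my_memmove (void *dest, const void *src, size_t len)
{
  unsigned char *d = dest;
  const unsigned char *s = src;
  /* If dest lies within [src, src + len), dest - src is an unsigned
     value smaller than len, so copy backward; otherwise forward.  */
  if ((uintptr_t) d - (uintptr_t) s < (uintptr_t) len)
    for (size_t i = len; i-- > 0; )   /* backward, like L(memmove_bwd) */
      d[i] = s[i];
  else
    for (size_t i = 0; i < len; i++)  /* forward copy */
      d[i] = s[i];
  return dest;
}
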
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/mempcpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/mempcpy.S
new file mode 100644
index 0000000000..4e15d1e40c
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/mempcpy.S
@@ -0,0 +1,472 @@
+/* Optimized mempcpy implementation for POWER7.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+
+/* __ptr_t [r3] __mempcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+    Returns 'dst' + 'len'.  */
+
+#ifndef MEMPCPY
+# define MEMPCPY __mempcpy
+#endif
+	.machine  power7
+EALIGN (MEMPCPY, 5, 0)
+	CALL_MCOUNT 3
+
+	cmpldi	cr1,5,31
+	neg	0,3
+	std	3,-16(1)
+	std	31,-8(1)
+	cfi_offset(31,-8)
+	ble	cr1,L(copy_LT_32)   /* If move < 32 bytes use short move
+				       code.  */
+
+	andi.	11,3,7	      /* Check alignment of DST.  */
+
+
+	clrldi	10,4,61	      /* Check alignment of SRC.  */
+	cmpld	cr6,10,11     /* SRC and DST alignments match?  */
+	mr	12,4
+	mr	31,5
+	bne	cr6,L(copy_GE_32_unaligned)
+
+	srdi	9,5,3	      /* Number of full doublewords remaining.  */
+
+	beq	L(copy_GE_32_aligned_cont)
+
+	clrldi	0,0,61
+	mtcrf	0x01,0
+	subf	31,0,5
+
+	/* Get the SRC aligned to 8 bytes.  */
+
+1:	bf	31,2f
+	lbz	6,0(12)
+	addi	12,12,1
+	stb	6,0(3)
+	addi	3,3,1
+2:	bf	30,4f
+	lhz	6,0(12)
+	addi	12,12,2
+	sth	6,0(3)
+	addi	3,3,2
+4:	bf	29,0f
+	lwz	6,0(12)
+	addi	12,12,4
+	stw	6,0(3)
+	addi	3,3,4
+0:
+	clrldi	10,12,61      /* Check alignment of SRC again.  */
+	srdi	9,31,3	      /* Number of full doublewords remaining.  */
+
+L(copy_GE_32_aligned_cont):
+
+	clrldi	11,31,61
+	mtcrf	0x01,9
+
+	srdi	8,31,5
+	cmpldi	cr1,9,4
+	cmpldi	cr6,11,0
+	mr	11,12
+
+	/* Copy 1~3 doublewords so the main loop starts
+	at a multiple of 32 bytes.  */
+
+	bf	30,1f
+	ld	6,0(12)
+	ld	7,8(12)
+	addi	11,12,16
+	mtctr	8
+	std	6,0(3)
+	std	7,8(3)
+	addi	10,3,16
+	bf	31,4f
+	ld	0,16(12)
+	std	0,16(3)
+	blt	cr1,3f
+	addi	11,12,24
+	addi	10,3,24
+	b	4f
+
+	.align	4
+1:	/* Copy 1 doubleword and set the counter.  */
+	mr	10,3
+	mtctr	8
+	bf	31,4f
+	ld	6,0(12)
+	addi	11,12,8
+	std	6,0(3)
+	addi	10,3,8
+
+	/* Main aligned copy loop. Copies 32-bytes at a time.  */
+	.align	4
+4:
+	ld	6,0(11)
+	ld	7,8(11)
+	ld	8,16(11)
+	ld	0,24(11)
+	addi	11,11,32
+
+	std	6,0(10)
+	std	7,8(10)
+	std	8,16(10)
+	std	0,24(10)
+	addi	10,10,32
+	bdnz	4b
+3:
+
+	/* Check for tail bytes.  */
+	rldicr	0,31,0,60
+	mtcrf	0x01,31
+	beq	cr6,0f
+
+.L9:
+	add	3,3,0
+	add	12,12,0
+
+	/*  At this point we have a tail of 0-7 bytes and we know that the
+	destination is doubleword-aligned.  */
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	lwz	6,0(12)
+	addi	12,12,4
+	stw	6,0(3)
+	addi	3,3,4
+2:	/* Copy 2 bytes.  */
+	bf	30,1f
+
+	lhz	6,0(12)
+	addi	12,12,2
+	sth	6,0(3)
+	addi	3,3,2
+1:	/* Copy 1 byte.  */
+	bf	31,0f
+
+	lbz	6,0(12)
+	stb	6,0(3)
+0:	/* Return DST + LEN pointer.  */
+	ld	31,-8(1)
+	ld	3,-16(1)
+	add	3,3,5
+	blr
+
+	/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32):
+	cmpldi	cr6,5,8
+	mr	12,4
+	mtcrf	0x01,5
+	ble	cr6,L(copy_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	8,4
+	clrrdi	11,4,2
+	andi.	0,8,3
+	cmpldi	cr1,5,16
+	mr	10,5
+	beq	L(copy_LT_32_aligned)
+
+	/* Force 4-bytes alignment for SRC.  */
+	mtocrf  0x01,0
+	subf	10,0,5
+2:	bf	30,1f
+
+	lhz	6,0(12)
+	addi	12,12,2
+	sth	6,0(3)
+	addi	3,3,2
+1:	bf	31,L(end_4bytes_alignment)
+
+	lbz	6,0(12)
+	addi	12,12,1
+	stb	6,0(3)
+	addi	3,3,1
+
+	.align	4
+L(end_4bytes_alignment):
+	cmpldi	cr1,10,16
+	mtcrf	0x01,10
+
+L(copy_LT_32_aligned):
+	/* At least 6 bytes to go, and SRC is word-aligned.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	lwz	6,0(12)
+	lwz	7,4(12)
+	stw	6,0(3)
+	lwz	8,8(12)
+	stw	7,4(3)
+	lwz	6,12(12)
+	addi	12,12,16
+	stw	8,8(3)
+	stw	6,12(3)
+	addi	3,3,16
+8:	/* Copy 8 bytes.  */
+	bf	28,4f
+
+	lwz	6,0(12)
+	lwz	7,4(12)
+	addi	12,12,8
+	stw	6,0(3)
+	stw	7,4(3)
+	addi	3,3,8
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	lwz	6,0(12)
+	addi	12,12,4
+	stw	6,0(3)
+	addi	3,3,4
+2:	/* Copy 2-3 bytes.  */
+	bf	30,1f
+
+	lhz	6,0(12)
+	sth	6,0(3)
+	bf	31,0f
+	lbz	7,2(12)
+	stb	7,2(3)
+	ld	3,-16(1)
+	add	3,3,5
+	blr
+
+	.align	4
+1:	/* Copy 1 byte.  */
+	bf	31,0f
+
+	lbz	6,0(12)
+	stb	6,0(3)
+0:	/* Return DST + LEN pointer.  */
+	ld	3,-16(1)
+	add	3,3,5
+	blr
+
+	/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8):
+	bne	cr6,4f
+
+	/* Though we could've used ld/std here, they are still
+	slow for unaligned cases.  */
+
+	lwz	6,0(4)
+	lwz	7,4(4)
+	stw	6,0(3)
+	stw	7,4(3)
+	ld	3,-16(1)      /* Return DST + LEN pointer.  */
+	add	3,3,5
+	blr
+
+	.align	4
+4:	/* Copies 4~7 bytes.  */
+	bf	29,2b
+
+	lwz	6,0(4)
+	stw	6,0(3)
+	bf	30,5f
+	lhz	7,4(4)
+	sth	7,4(3)
+	bf	31,0f
+	lbz	8,6(4)
+	stb	8,6(3)
+	ld	3,-16(1)
+	add	3,3,5
+	blr
+
+	.align	4
+5:	/* Copy 1 byte.  */
+	bf	31,0f
+
+	lbz	6,4(4)
+	stb	6,4(3)
+
+0:	/* Return DST + LEN pointer.  */
+	ld	3,-16(1)
+	add	3,3,5
+	blr
+
+	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+	SRC is not.  Use aligned quadword loads from SRC, shifted to realign
+	the data, allowing for aligned DST stores.  */
+	.align	4
+L(copy_GE_32_unaligned):
+	clrldi	0,0,60	      /* Number of bytes until the 1st
+				 quadword.  */
+	andi.	11,3,15	      /* Check alignment of DST (against
+				 quadwords).  */
+	srdi	9,5,4	      /* Number of full quadwords remaining.  */
+
+	beq	L(copy_GE_32_unaligned_cont)
+
+	/* DST is not quadword aligned, get it aligned.  */
+
+	mtcrf	0x01,0
+	subf	31,0,5
+
+	/* Vector instructions work best when proper alignment (16-bytes)
+	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
+1:	/* Copy 1 byte.  */
+	bf	31,2f
+
+	lbz	6,0(12)
+	addi	12,12,1
+	stb	6,0(3)
+	addi	3,3,1
+2:	/* Copy 2 bytes.  */
+	bf	30,4f
+
+	lhz	6,0(12)
+	addi	12,12,2
+	sth	6,0(3)
+	addi	3,3,2
+4:	/* Copy 4 bytes.  */
+	bf	29,8f
+
+	lwz	6,0(12)
+	addi	12,12,4
+	stw	6,0(3)
+	addi	3,3,4
+8:	/* Copy 8 bytes.  */
+	bf	28,0f
+
+	ld	6,0(12)
+	addi	12,12,8
+	std	6,0(3)
+	addi	3,3,8
+0:
+	clrldi	10,12,60      /* Check alignment of SRC.  */
+	srdi	9,31,4	      /* Number of full quadwords remaining.  */
+
+	/* The proper alignment is present, it is OK to copy the bytes now.  */
+L(copy_GE_32_unaligned_cont):
+
+	/* Setup two indexes to speed up the indexed vector operations.  */
+	clrldi	11,31,60
+	li	6,16	      /* Index for 16-bytes offsets.  */
+	li	7,32	      /* Index for 32-bytes offsets.  */
+	cmpldi	cr1,11,0
+	srdi	8,31,5	      /* Setup the loop counter.  */
+	mr	10,3
+	mr	11,12
+	mtcrf	0x01,9
+	cmpldi	cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr    5,0,12
+#else
+	lvsl    5,0,12
+#endif
+	lvx	3,0,12
+	bf	31,L(setup_unaligned_loop)
+
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	4,12,6
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
+	addi	11,12,16
+	addi	10,3,16
+	stvx	6,0,3
+	vor	3,4,4
+
+L(setup_unaligned_loop):
+	mtctr	8
+	ble	cr6,L(end_unaligned_loop)
+
+	/* Copy 32 bytes at a time using vector instructions.  */
+	.align	4
+L(unaligned_loop):
+
+	/* Note: vr6/vr10 may contain data that was already copied,
+	but in order to get proper alignment, we may have to copy
+	some portions again.  This is still faster than using unaligned
+	vector instructions.  */
+
+	lvx	4,11,6	      /* vr4 = r11+16.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
+	lvx	3,11,7	      /* vr3 = r11+32.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   10,3,4,5
+#else
+	vperm   10,4,3,5
+#endif
+	addi	11,11,32
+	stvx	6,0,10
+	stvx	10,10,6
+	addi	10,10,32
+
+	bdnz	L(unaligned_loop)
+
+	.align	4
+L(end_unaligned_loop):
+
+	/* Check for tail bytes.  */
+	rldicr	0,31,0,59
+	mtcrf	0x01,31
+	beq	cr1,0f
+
+	add	3,3,0
+	add	12,12,0
+
+	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
+8:	/* Copy 8 bytes.  */
+	bf	28,4f
+
+	lwz	6,0(12)
+	lwz	7,4(12)
+	addi	12,12,8
+	stw	6,0(3)
+	stw	7,4(3)
+	addi	3,3,8
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	lwz	6,0(12)
+	addi	12,12,4
+	stw	6,0(3)
+	addi	3,3,4
+2:	/* Copy 2~3 bytes.  */
+	bf	30,1f
+
+	lhz	6,0(12)
+	addi	12,12,2
+	sth	6,0(3)
+	addi	3,3,2
+1:	/* Copy 1 byte.  */
+	bf	31,0f
+
+	lbz	6,0(12)
+	stb	6,0(3)
+0:	/* Return DST + LEN pointer.  */
+	ld	31,-8(1)
+	ld	3,-16(1)
+	add	3,3,5
+	blr
+
+END_GEN_TB (MEMPCPY,TB_TOCLESS)
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
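
The point of mempcpy is visible in the epilogues above: the original DST
is saved at -16(r1) on entry and reloaded so that DST + LEN can be
returned.  That return value makes chained appends cheap, as in this
small usage sketch (build_string and the literals are illustrative):

#define _GNU_SOURCE
#include <string.h>

static void build_string (char *buf)
{
  char *p = buf;
  /* Each call returns the first byte past what it wrote, so the next
     append needs no length bookkeeping.  */
  p = mempcpy (p, "foo", 3);
  p = mempcpy (p, "bar", 4);   /* copies the terminating NUL too */
}
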
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memrchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memrchr.S
new file mode 100644
index 0000000000..4276768915
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memrchr.S
@@ -0,0 +1,201 @@
+/* Optimized memrchr implementation for PowerPC64/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* void * [r3] memrchr (const void *s [r3], int byte [r4], size_t size [r5])  */
+
+#ifndef MEMRCHR
+# define MEMRCHR __memrchr
+#endif
+	.machine  power7
+ENTRY (MEMRCHR)
+	CALL_MCOUNT 3
+	add	r7,r3,r5      /* Calculate the last acceptable address.  */
+	neg	r0,r7
+	addi	r7,r7,-1
+	mr	r10,r3
+	clrrdi	r6,r7,7
+	li	r9,3<<5
+	dcbt	r9,r6,8       /* Stream hint, decreasing addresses.  */
+
+	/* Replicate BYTE to doubleword.  */
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
+	insrdi  r4,r4,32,0
+	li	r6,-8
+	li	r9,-1
+	rlwinm	r0,r0,3,26,28 /* Calculate padding.  */
+	clrrdi	r8,r7,3
+	srd	r9,r9,r0
+	cmpldi	r5,32
+	clrrdi	r0,r10,3
+	ble	L(small_range)
+
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+#else
+	ldbrx	r12,0,r8      /* Load reversed doubleword from memory.  */
+#endif
+	cmpb	r3,r12,r4     /* Check for BYTE in DWORD1.  */
+	and	r3,r3,r9
+	cmpldi	cr7,r3,0      /* If r3 == 0, no BYTEs have been found.  */
+	bne	cr7,L(done)
+
+	mtcrf   0x01,r8
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+	bf	28,L(loop_setup)
+
+	/* Handle DWORD2 of pair.  */
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,r8,r6
+#else
+	ldbrx	r12,r8,r6
+#endif
+	addi	r8,r8,-8
+	cmpb	r3,r12,r4
+	cmpldi	cr7,r3,0
+	bne	cr7,L(done)
+
+L(loop_setup):
+	/* The last dword we want to read in the loop below is the one
+	   containing the first byte of the string, ie. the dword at
+	   s & ~7, or r0.  The first dword read is at r8 - 8, we
+	   read 2 * cnt dwords, so the last dword read will be at
+	   r8 - 8 - 16 * cnt + 8.  Solving for cnt gives
+	   cnt = (r8 - r0) / 16  */
+	sub	r5,r8,r0
+	addi	r8,r8,-8
+	srdi	r9,r5,4       /* Number of loop iterations.  */
+	mtctr	r9	      /* Setup the counter.  */
+
+	/* Main loop to look for BYTE backwards in the string.
+	   FIXME: Investigate whether 32 byte align helps with this
+	   9 instruction loop.  */
+	.align	5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the byte-checking process for bigger strings.  */
+
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+	ldx	r11,r8,r6
+#else
+	ldbrx	r12,0,r8
+	ldbrx	r11,r8,r6
+#endif
+	cmpb	r3,r12,r4
+	cmpb	r9,r11,r4
+	or	r5,r9,r3      /* Merge everything in one doubleword.  */
+	cmpldi	cr7,r5,0
+	bne	cr7,L(found)
+	addi	r8,r8,-16
+	bdnz	L(loop)
+
+	/* We may have one more word to read.  */
+	cmpld	r8,r0
+	bnelr
+
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+#else
+	ldbrx	r12,0,r8
+#endif
+	cmpb	r3,r12,r4
+	cmpldi	cr7,r3,0
+	bne	cr7,L(done)
+	blr
+
+	.align	4
+L(found):
+	/* OK, one (or both) of the dwords contains BYTE.  Check
+	   the first dword.  */
+	cmpldi	cr6,r3,0
+	bne	cr6,L(done)
+
+	/* BYTE must be in the second dword.  Adjust the address
+	   again and move the result of cmpb to r3 so we can calculate the
+	   pointer.  */
+
+	mr	r3,r9
+	addi	r8,r8,-8
+
+	/* r3 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as BYTE in the original
+	   word from the string.  Use that to calculate the pointer.
+	   We need to make sure BYTE is *before* the end of the
+	   range.  */
+L(done):
+	cntlzd	r9,r3	      /* Count leading zeros before the match.  */
+	cmpld	r8,r0         /* Are we on the last word?  */
+	srdi	r6,r9,3	      /* Convert leading zeros to bytes.  */
+	addi	r0,r6,-7
+	sub	r3,r8,r0
+	cmpld	cr7,r3,r10
+	bnelr
+	bgelr	cr7
+	li	r3,0
+	blr
+
+	.align	4
+L(null):
+	li	r3,0
+	blr
+
+/* Deals with size <= 32.  */
+	.align	4
+L(small_range):
+	cmpldi	r5,0
+	beq	L(null)
+
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+#else
+	ldbrx	r12,0,r8      /* Load reversed doubleword from memory.  */
+#endif
+	cmpb	r3,r12,r4     /* Check for BYTE in DWORD1.  */
+	and	r3,r3,r9
+	cmpldi	cr7,r3,0
+	bne	cr7,L(done)
+
+	/* Are we done already?  */
+	cmpld	r8,r0
+	addi	r8,r8,-8
+	beqlr
+
+	.align	5
+L(loop_small):
+#ifdef __LITTLE_ENDIAN__
+	ldx	r12,0,r8
+#else
+	ldbrx	r12,0,r8
+#endif
+	cmpb	r3,r12,r4
+	cmpld	r8,r0
+	cmpldi	cr7,r3,0
+	bne	cr7,L(done)
+	addi	r8,r8,-8
+	bne	L(loop_small)
+	blr
+
+END (MEMRCHR)
+weak_alias (__memrchr, memrchr)
+libc_hidden_builtin_def (memrchr)
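
In L(done) above, the result pointer falls straight out of the cmpb mask:
the loads are byte-reversed (ldbrx on big-endian; plain ldx already gives
that order on little-endian), so the highest-addressed byte sits in the
most significant lane and cntlzd / 8 counts the non-matching bytes above
the last match.  A C model of that one step, assuming a nonzero mask in
that reversed lane order (last_match is a hypothetical helper):

#include <stdint.h>

/* mask holds 0xff in each matching byte lane; the most significant
   lane corresponds to dword_base + 7, the highest address.  */
static const char *last_match (const char *dword_base, uint64_t mask)
{
  int lz_bytes = __builtin_clzll (mask) >> 3;  /* mask != 0 assumed */
  return dword_base + 7 - lz_bytes;
}
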
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memset.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memset.S
new file mode 100644
index 0000000000..21933c0672
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/memset.S
@@ -0,0 +1,399 @@
+/* Optimized memset implementation for PowerPC64/POWER7.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
+   Returns 's'.  */
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+	.machine power7
+EALIGN (MEMSET, 5, 0)
+	CALL_MCOUNT 3
+
+L(_memset):
+	cmpldi	cr7,5,31
+	cmpldi	cr6,5,8
+	mr	10,3
+
+	/* Replicate byte to word.  */
+	insrdi	4,4,8,48
+	insrdi	4,4,16,32
+	ble	cr6,L(small)	/* If length <= 8, use short copy code.  */
+
+	neg	0,3
+	ble	cr7,L(medium)	/* If length < 32, use medium copy code.  */
+
+	andi.	11,10,7		/* Check alignment of DST.  */
+	insrdi	4,4,32,0	/* Replicate word to double word.  */
+
+	mr	12,5
+	beq	L(big_aligned)
+
+	clrldi	0,0,61
+	mtocrf	0x01,0
+	subf	5,0,5
+
+	/* Get DST aligned to 8 bytes.  */
+1:	bf	31,2f
+
+	stb	4,0(10)
+	addi	10,10,1
+2:	bf	30,4f
+
+	sth	4,0(10)
+	addi	10,10,2
+4:	bf	29,L(big_aligned)
+
+	stw	4,0(10)
+	addi	10,10,4
+
+	.align	4
+L(big_aligned):
+
+	cmpldi	cr5,5,255
+	li	0,32
+	dcbtst	0,10
+	cmpldi	cr6,4,0
+	srdi	9,5,3	/* Number of full doublewords remaining.  */
+	crand	27,26,21
+	mtocrf	0x01,9
+	bt	27,L(huge)
+
+	/* From this point on, we'll copy 32+ bytes and the value
+	   isn't 0 (so we can't use dcbz).  */
+
+	srdi	8,5,5
+	clrldi	11,5,61
+	cmpldi	cr6,11,0
+	cmpldi	cr1,9,4
+	mtctr	8
+
+	/* Copy 1~3 doublewords so the main loop starts
+	at a multiple of 32 bytes.  */
+
+	bf	30,1f
+
+	std	4,0(10)
+	std	4,8(10)
+	addi	10,10,16
+	bf	31,L(big_loop)
+
+	std	4,0(10)
+	addi	10,10,8
+	mr	12,10
+	blt	cr1,L(tail_bytes)
+	b	L(big_loop)
+
+	.align	4
+1:	/* Copy 1 doubleword.  */
+	bf	31,L(big_loop)
+
+	std	4,0(10)
+	addi	10,10,8
+
+	/* Main aligned set loop.  Stores 32 bytes at a time and
+	   ping-pongs through r10 and r12 to avoid AGEN delays.  */
+	.align	4
+L(big_loop):
+	addi	12,10,32
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	bdz	L(tail_bytes)
+
+	addi	10,10,64
+	std	4,0(12)
+	std	4,8(12)
+	std	4,16(12)
+	std	4,24(12)
+	bdnz	L(big_loop)
+
+	mr	12,10
+	b	L(tail_bytes)
+
+	.align	4
+L(tail_bytes):
+
+	/* Check for tail bytes.  */
+	beqlr	cr6
+
+	clrldi	0,5,61
+	mtocrf	0x01,0
+
+	/*  At this point we have a tail of 0-7 bytes and we know that the
+	destination is doubleword-aligned.  */
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	stw	4,0(12)
+	addi	12,12,4
+2:	/* Copy 2 bytes.  */
+	bf	30,1f
+
+	sth	4,0(12)
+	addi	12,12,2
+1:	/* Copy 1 byte.  */
+	bflr	31
+
+	stb	4,0(12)
+	blr
+
+	/* Special case when value is 0 and we have a long length to deal
+	   with.  Use dcbz to zero out 128-bytes at a time.  Before using
+	   dcbz though, we need to get the destination 128-bytes aligned.  */
+	.align	4
+L(huge):
+	andi.	11,10,127
+	neg	0,10
+	beq	L(huge_aligned)
+
+	clrldi	0,0,57
+	subf	5,0,5
+	srdi	0,0,3
+	mtocrf	0x01,0
+
+	/* Get DST aligned to 128 bytes.  */
+8:	bf	28,4f
+
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	std	4,32(10)
+	std	4,40(10)
+	std	4,48(10)
+	std	4,56(10)
+	addi	10,10,64
+	.align	4
+4:	bf	29,2f
+
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	addi	10,10,32
+	.align	4
+2:	bf	30,1f
+
+	std	4,0(10)
+	std	4,8(10)
+	addi	10,10,16
+	.align	4
+1:	bf	31,L(huge_aligned)
+
+	std	4,0(10)
+	addi	10,10,8
+
+
+L(huge_aligned):
+	srdi	8,5,7
+	clrldi	11,5,57
+	cmpldi	cr6,11,0
+	mtctr	8
+
+	.align	4
+L(huge_loop):
+	dcbz	0,10
+	addi	10,10,128
+	bdnz	L(huge_loop)
+
+	/* Check how many bytes are still left.  */
+	beqlr	cr6
+
+	subf	9,3,10
+	subf	5,9,12
+	srdi	8,5,3
+	cmpldi	cr6,8,0
+	mtocrf	0x01,8
+
+	/* We have a tail of 1~127 bytes.  Copy up to 15 doublewords for
+	speed.  We'll handle the resulting tail bytes later.  */
+	beq	cr6,L(tail)
+
+8:	bf	28,4f
+
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	std	4,32(10)
+	std	4,40(10)
+	std	4,48(10)
+	std	4,56(10)
+	addi	10,10,64
+	.align	4
+4:	bf	29,2f
+
+	std	4,0(10)
+	std	4,8(10)
+	std	4,16(10)
+	std	4,24(10)
+	addi	10,10,32
+	.align	4
+2:	bf	30,1f
+
+	std	4,0(10)
+	std	4,8(10)
+	addi	10,10,16
+	.align	4
+1:	bf	31,L(tail)
+
+	std	4,0(10)
+	addi	10,10,8
+
+	/* Handle the rest of the tail bytes here.  */
+L(tail):
+	mtocrf	0x01,5
+
+	.align	4
+4:	bf	29,2f
+
+	stw	4,0(10)
+	addi	10,10,4
+	.align	4
+2:	bf	30,1f
+
+	sth	4,0(10)
+	addi	10,10,2
+	.align	4
+1:	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Expanded tree to copy tail bytes without increments.  */
+	.align	4
+L(copy_tail):
+	bf	29,L(FXX)
+
+	stw	4,0(10)
+	bf	30,L(TFX)
+
+	sth	4,4(10)
+	bflr	31
+
+	stb	4,6(10)
+	blr
+
+	.align	4
+L(FXX):	bf	30,L(FFX)
+
+	sth	4,0(10)
+	bflr	31
+
+	stb	4,2(10)
+	blr
+
+	.align	4
+L(TFX):	bflr	31
+
+	stb	4,4(10)
+	blr
+
+	.align	4
+L(FFX):	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Handle copies of 9~31 bytes.  */
+	.align	4
+L(medium):
+	/* At least 9 bytes to go.  */
+	andi.	11,10,3
+	clrldi	0,0,62
+	beq	L(medium_aligned)
+
+	/* Force 4-bytes alignment for DST.  */
+	mtocrf	0x01,0
+	subf	5,0,5
+1:	/* Copy 1 byte.  */
+	bf	31,2f
+
+	stb	4,0(10)
+	addi	10,10,1
+2:	/* Copy 2 bytes.  */
+	bf	30,L(medium_aligned)
+
+	sth	4,0(10)
+	addi	10,10,2
+
+	.align	4
+L(medium_aligned):
+	/* At least 6 bytes to go, and DST is word-aligned.  */
+	cmpldi	cr1,5,16
+	mtocrf	0x01,5
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	stw	4,0(10)
+	stw	4,4(10)
+	stw	4,8(10)
+	stw	4,12(10)
+	addi	10,10,16
+8:	/* Copy 8 bytes.  */
+	bf	28,4f
+
+	stw	4,0(10)
+	stw	4,4(10)
+	addi	10,10,8
+4:	/* Copy 4 bytes.  */
+	bf	29,2f
+
+	stw	4,0(10)
+	addi	10,10,4
+2:	/* Copy 2-3 bytes.  */
+	bf	30,1f
+
+	sth	4,0(10)
+	addi	10,10,2
+1:	/* Copy 1 byte.  */
+	bflr	31
+
+	stb	4,0(10)
+	blr
+
+	/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(small):
+	mtocrf	0x01,5
+	bne	cr6,L(copy_tail)
+
+	stw	4,0(10)
+	stw	4,4(10)
+	blr
+
+END_GEN_TB (MEMSET,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+   between bzero and memset.  */
+ENTRY (__bzero)
+	CALL_MCOUNT 3
+	mr	r5,r4
+	li	r4,0
+	b	L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
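
The L(huge) path above exploits dcbz, which zeroes an entire 128-byte
POWER7 cache line without reading it, so for c == 0 and big lengths the
bulk of the work is one dcbz per line, with plain stores only for the
unaligned head and tail.  A PowerPC-only C sketch of that shape
(zero_huge is a hypothetical name; n is assumed well above the 128-byte
head, matching the > 255 entry condition in the assembly):

#include <stdint.h>
#include <string.h>

static void zero_huge (char *p, size_t n)
{
  size_t head = (size_t) (-(uintptr_t) p & 127); /* bytes to next line */
  memset (p, 0, head);                           /* align to the line  */
  p += head;
  n -= head;
  while (n >= 128)
    {
      /* Zero the whole cache line containing p without fetching it.  */
      __asm__ volatile ("dcbz 0,%0" : : "r" (p) : "memory");
      p += 128;
      n -= 128;
    }
  memset (p, 0, n);                              /* remaining tail     */
}
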
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/multiarch/Implies b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/multiarch/Implies
new file mode 100644
index 0000000000..bf5d6171a5
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/multiarch/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/power6/multiarch
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/rawmemchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
new file mode 100644
index 0000000000..48afb75943
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/rawmemchr.S
@@ -0,0 +1,115 @@
+/* Optimized rawmemchr implementation for PowerPC64/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* void * [r3] rawmemchr (const void *s [r3], int c [r4])  */
+
+#ifndef RAWMEMCHR
+# define RAWMEMCHR __rawmemchr
+#endif
+	.machine  power7
+ENTRY (RAWMEMCHR)
+	CALL_MCOUNT 2
+	dcbt	0,r3
+	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
+
+	/* Replicate byte to doubleword.  */
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
+	insrdi	r4,r4,32,0
+
+	/* Now r4 has a doubleword of c bytes.  */
+
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
+	cmpb	r5,r12,r4     /* Compare each byte against c byte.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r5,r5,r6
+	sld	r5,r5,r6
+#else
+	sld	r5,r5,r6      /* Move left to discard ignored bits.  */
+	srd	r5,r5,r6      /* Bring the bits back as zeros.  */
+#endif
+	cmpdi	cr7,r5,0      /* If r5 == 0, no c bytes have been found.  */
+	bne	cr7,L(done)
+
+	mtcrf   0x01,r8
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	28,L(loop)
+
+	/* Handle DWORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb	r5,r12,r4
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done)
+	b	L(loop)	      /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	/* Main loop to look for the 'c' byte.  Since it's a
+	   small loop (< 8 instructions), align it to 32-bytes.  */
+	.p2align  5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the byte-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu     r11,16(r8)
+	cmpb	r5,r12,r4
+	cmpb	r6,r11,r4
+	or	r7,r5,r6
+	cmpdi	cr7,r7,0
+	beq	cr7,L(loop)
+
+	/* OK, one (or both) of the doublewords contains a 'c' byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a c byte.  */
+
+	cmpdi	cr6,r5,0
+	addi	r8,r8,-8
+	bne	cr6,L(done)
+
+	/* The 'c' byte must be in the second doubleword.  Adjust the address
+	   again and move the result of cmpb to r5 so we can calculate the
+	   pointer.  */
+	mr	r5,r6
+	addi	r8,r8,8
+
+	/* r5 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the 'c' byte in the original
+	   doubleword from the string.  Use that fact to find out what is
+	   the position of the byte inside the string.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntd	r0,r0	      /* Count trailing zeros.  */
+#else
+	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
+#endif
+	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of the matching char.  */
+	blr
+END (RAWMEMCHR)
+weak_alias (__rawmemchr,rawmemchr)
+libc_hidden_builtin_def (__rawmemchr)
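
Two scalar identities make the code above easier to follow: the three
insrdi instructions replicate the search byte into every lane of a
doubleword (the same effect as multiplying by 0x0101010101010101), and
cmpb writes 0xff into each byte lane where its two operands agree.  A
portable C model of both, as an illustration only (replicate_byte and
cmpb_model are hypothetical helpers, not the instructions themselves):

#include <stdint.h>

static uint64_t replicate_byte (unsigned char c)
{
  return (uint64_t) c * 0x0101010101010101ULL;  /* like the insrdi chain */
}

/* 0xff in every byte lane where a and b hold equal bytes, else 0x00.  */
static uint64_t cmpb_model (uint64_t a, uint64_t b)
{
  uint64_t r = 0;
  for (int i = 0; i < 64; i += 8)
    if (((a >> i) & 0xff) == ((b >> i) & 0xff))
      r |= 0xffULL << i;
  return r;
}
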
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/stpncpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/stpncpy.S
new file mode 100644
index 0000000000..a346dd7e28
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER7.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power7/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
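
The only thing USE_AS_STPNCPY changes in the shared strncpy body is the
value left in r3: strncpy returns dest, while stpncpy returns a pointer
just past the last byte copied from src.  A C sketch of that contract
(my_stpncpy is a hypothetical name):

#include <string.h>

/* Returns a pointer to the first NUL written into dest, or dest + n if
   src does not terminate within n bytes; pads with NULs like strncpy. */
static char *my_stpncpy (char *dest, const char *src, size_t n)
{
  size_t len = strnlen (src, n);
  memcpy (dest, src, len);
  memset (dest + len, 0, n - len);
  return dest + len;
}
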
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
new file mode 100644
index 0000000000..e856b8a593
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp.S
@@ -0,0 +1,126 @@
+/* Optimized strcasecmp implementation for PowerPC64.
+   Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <locale-defines.h>
+
+/* int [r3] strcasecmp (const char *s1 [r3], const char *s2 [r4])
+
+   or if defined USE_IN_EXTENDED_LOCALE_MODEL:
+
+   int [r3] strcasecmp_l (const char *s1 [r3], const char *s2 [r4],
+                          __locale_t loc [r5]) */
+
+#ifndef STRCMP
+# define __STRCMP __strcasecmp
+# define STRCMP   strcasecmp
+#endif
+
+ENTRY (__STRCMP)
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+	CALL_MCOUNT 2
+#else
+	CALL_MCOUNT 3
+#endif
+
+#define rRTN	r3	/* Return value */
+#define rSTR1	r5	/* 1st string */
+#define rSTR2	r4	/* 2nd string */
+#define rLOCARG	r5	/* 3rd argument: locale_t */
+#define rCHAR1	r6	/* Byte read from 1st string */
+#define rCHAR2	r7	/* Byte read from 2nd string */
+#define rADDR1	r8	/* Address of tolower(rCHAR1) */
+#define rADDR2	r12	/* Address of tolower(rCHAR2) */
+#define rLWR1	r8	/* Word tolower(rCHAR1) */
+#define rLWR2	r12	/* Word tolower(rCHAR2) */
+#define rTMP	r9
+#define rLOC	r11	/* Default locale address */
+
+	cmpd	cr7, r3, r4
+#ifndef USE_IN_EXTENDED_LOCALE_MODEL
+	ld 	rTMP, __libc_tsd_LOCALE@got@tprel(r2)
+	add 	rLOC, rTMP, __libc_tsd_LOCALE@tls
+	ld	rLOC, 0(rLOC)
+#else
+	mr	rLOC, rLOCARG
+#endif
+	ld	rLOC, LOCALE_CTYPE_TOLOWER(rLOC)
+	mr	rSTR1, rRTN
+	li	rRTN, 0
+	beqlr	cr7
+
+
+	/* Unrolled loop for POWER: loads are done with 'lbz' plus an
+	offset, and the string pointers are only updated at the end of
+	each unrolled pass. */
+
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+L(loop):
+	cmpdi	rCHAR1, 0		/* *s1 == '\0' ? */
+	sldi	rADDR1, rCHAR1, 2	/* Calculate address for tolower(*s1) */
+	sldi	rADDR2, rCHAR2, 2	/* Calculate address for tolower(*s2) */
+	lwzx	rLWR1, rLOC, rADDR1	/* Load tolower(*s1) */
+	lwzx	rLWR2, rLOC, rADDR2	/* Load tolower(*s2) */
+	cmpw	cr1, rLWR1, rLWR2	/* r = tolower(*s1) == tolower(*s2) ? */
+	crorc	4*cr1+eq,eq,4*cr1+eq	/* (*s1 == '\0') || (r == 0) */
+	beq	cr1, L(done)
+	lbz	rCHAR1, 1(rSTR1)
+	lbz	rCHAR2, 1(rSTR2)
+	cmpdi	rCHAR1, 0
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq	cr1, L(done)
+	lbz	rCHAR1, 2(rSTR1)
+	lbz	rCHAR2, 2(rSTR2)
+	cmpdi	rCHAR1, 0
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq	cr1, L(done)
+	lbz	rCHAR1, 3(rSTR1)
+	lbz	rCHAR2, 3(rSTR2)
+	cmpdi	rCHAR1, 0
+	/* Increment both string descriptors */
+	addi	rSTR1, rSTR1, 4
+	addi	rSTR2, rSTR2, 4
+	sldi	rADDR1, rCHAR1, 2
+	sldi	rADDR2, rCHAR2, 2
+	lwzx	rLWR1, rLOC, rADDR1
+	lwzx	rLWR2, rLOC, rADDR2
+	cmpw	cr1, rLWR1, rLWR2
+	crorc	4*cr1+eq,eq,4*cr1+eq
+	beq     cr1,L(done)
+	lbz	rCHAR1, 0(rSTR1)	/* Load char from s1 */
+	lbz	rCHAR2, 0(rSTR2)	/* Load char from s2 */
+	b	L(loop)
+L(done):
+	subf	r0, rLWR2, rLWR1
+	extsw	rRTN, r0
+	blr
+END (__STRCMP)
+
+weak_alias (__STRCMP, STRCMP)
+libc_hidden_builtin_def (__STRCMP)
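
Each sldi/lwzx pair above multiplies a byte by four and indexes the
current locale's tolower table, whose entries are 32-bit values; the rest
is the same compare unrolled four times.  A byte-at-a-time C model of the
loop's logic (my_strcasecmp is a hypothetical name; tolower stands in for
the locale-table lookup):

#include <ctype.h>

static int my_strcasecmp (const char *s1, const char *s2)
{
  const unsigned char *p1 = (const unsigned char *) s1;
  const unsigned char *p2 = (const unsigned char *) s2;
  int c1, c2;
  do
    {
      c1 = tolower (*p1++);
      c2 = tolower (*p2++);
    }
  while (c1 != 0 && c1 == c2);  /* stop at NUL or first difference */
  return c1 - c2;
}
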
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
new file mode 100644
index 0000000000..c13c4ebcb8
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcasecmp_l.S
@@ -0,0 +1,5 @@
+#define USE_IN_EXTENDED_LOCALE_MODEL
+#define STRCMP   strcasecmp_l
+#define __STRCMP __strcasecmp_l
+
+#include "strcasecmp.S"
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchr.S
new file mode 100644
index 0000000000..a18e2e101c
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchr.S
@@ -0,0 +1,230 @@
+/* Optimized strchr implementation for PowerPC64/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef STRCHR
+# define STRCHR strchr
+#endif
+
+/* char * [r3] strchr (const char *s [r3], int c [r4])  */
+	.machine  power7
+ENTRY (STRCHR)
+	CALL_MCOUNT 2
+	dcbt	0,r3
+	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
+	cmpdi	cr7,r4,0
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
+	li	r0,0	      /* Doubleword with null chars to use
+				 with cmpb.  */
+
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+
+	beq	cr7,L(null_match)
+
+	/* Replicate byte to doubleword.  */
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
+	insrdi  r4,r4,32,0
+
+	/* Now r4 has a doubleword of c bytes and r0 has
+	   a doubleword of null bytes.  */
+
+	cmpb	r10,r12,r4     /* Compare each byte against c byte.  */
+	cmpb	r11,r12,r0     /* Compare each byte against null byte.  */
+
+	/* Move the doublewords left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+#else
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+#endif
+	or	r5,r10,r11    /* OR the results to speed things up.  */
+	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
+				 have been found.  */
+	bne	cr7,L(done)
+
+	mtcrf   0x01,r8
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	28,L(loop)
+
+	/* Handle DWORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb    r10,r12,r4
+	cmpb	r11,r12,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done)
+	b	L(loop)	      /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	.p2align  5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu	r9,16(r8)
+	cmpb	r10,r12,r4
+	cmpb	r11,r12,r0
+	cmpb	r6,r9,r4
+	cmpb	r7,r9,r0
+	or	r12,r10,r11
+	or	r9,r6,r7
+	or	r5,r12,r9
+	cmpdi	cr7,r5,0
+	beq	cr7,L(loop)
+
+	/* OK, one (or both) of the doublewords contains a c/null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a c/null byte.  */
+
+	cmpdi	cr6,r12,0
+	addi	r8,r8,-8
+	bne	cr6,L(done)
+
+	/* The c/null byte must be in the second doubleword.  Adjust the
+	   address again and move the result of cmpb to r10 so we can calculate
+	   the pointer.  */
+
+	mr	r10,r6
+	mr	r11,r7
+	addi	r8,r8,8
+
+	/* r10/r11 have the output of the cmpb instructions, that is,
+	   0xff in the same position as the c/null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r3,r10,-1
+	andc    r3,r3,r10
+	popcntd	r0,r3
+	addi    r4,r11,-1
+	andc    r4,r4,r11
+	cmpld	cr7,r3,r4
+	bgt	cr7,L(no_match)
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before c matches.  */
+	cmpld	cr7,r11,r10
+	bgt	cr7,L(no_match)
+#endif
+	srdi	r0,r0,3	      /* Convert leading zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of the matching c byte
+				 or null in case c was not found.  */
+	blr
+
+	.align	4
+L(no_match):
+	li	r3,0
+	blr
+
+/* We are here because strchr was called with a null byte.  */
+	.align	4
+L(null_match):
+	/* r0 has a doubleword of null bytes.  */
+
+	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
+
+	/* Move the doublewords left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r5,r5,r6
+	sld	r5,r5,r6
+#else
+	sld	r5,r5,r6
+	srd	r5,r5,r6
+#endif
+	cmpdi	cr7,r5,0      /* If r5 == 0, no null bytes
+				 have been found.  */
+	bne	cr7,L(done_null)
+
+	mtcrf   0x01,r8
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	28,L(loop_null)
+
+	/* Handle DWORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb    r5,r12,r0
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done_null)
+	b	L(loop_null)  /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	/* Main loop to look for the end of the string.  Since it's a
+	   small loop (< 8 instructions), align it to 32-bytes.  */
+	.p2align  5
+L(loop_null):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu     r11,16(r8)
+	cmpb	r5,r12,r0
+	cmpb	r10,r11,r0
+	or	r6,r5,r10
+	cmpdi	cr7,r6,0
+	beq	cr7,L(loop_null)
+
+	/* OK, one (or both) of the doublewords contains a null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a null byte.  */
+
+	cmpdi	cr6,r5,0
+	addi	r8,r8,-8
+	bne	cr6,L(done_null)
+
+	/* The null byte must be in the second doubleword.  Adjust the address
+	   again and move the result of cmpb to r10 so we can calculate the
+	   pointer.  */
+
+	mr	r5,r10
+	addi	r8,r8,8
+
+	/* r5 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntd	r0,r0
+#else
+	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
+#endif
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of the matching null byte.  */
+	blr
+END (STRCHR)
+weak_alias (strchr, index)
+libc_hidden_builtin_def (strchr)
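As a cross-check of the pointer calculation above, the whole doubleword scan can be modeled in a few lines of C.  This is an illustrative sketch, not glibc code: it assumes a little-endian machine and an 8-byte-aligned string (the assembly handles the unaligned head with the sld/srd masking seen above), and it stands in for the cmpb instruction with the classic zero-byte bit trick.  The names strchr_model and byte_eq are made up for the sketch.

#include <stdint.h>
#include <stddef.h>

/* 0x80 in every byte position where the corresponding bytes of x and y
   are equal; a portable stand-in for cmpb's 0xff marker bytes.  */
static inline uint64_t
byte_eq (uint64_t x, uint64_t y)
{
  uint64_t v = x ^ y;            /* Zero byte wherever x and y agree.  */
  return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
}

char *
strchr_model (const char *s, int c)
{
  uint64_t rep = (uint8_t) c * 0x0101010101010101ULL;  /* Replicate c.  */
  const uint64_t *p = (const uint64_t *) s;            /* Assume aligned.  */
  for (;; p++)
    {
      uint64_t hit = byte_eq (*p, rep);  /* Matches of c (r10 above).  */
      uint64_t nul = byte_eq (*p, 0);    /* Terminator matches (r11).  */
      if (hit | nul)
        {
          /* On little-endian the first match is the lowest set bit;
             this mirrors the (x - 1) & ~x, popcntd sequence in L(done).  */
          unsigned cpos = hit ? __builtin_ctzll (hit) / 8 : 8;
          unsigned npos = nul ? __builtin_ctzll (nul) / 8 : 8;
          return npos < cpos ? NULL : (char *) p + cpos;
        }
    }
}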
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchrnul.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchrnul.S
new file mode 100644
index 0000000000..27bc1f0682
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strchrnul.S
@@ -0,0 +1,131 @@
+/* Optimized strchrnul implementation for PowerPC64/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef STRCHRNUL
+# define STRCHRNUL __strchrnul
+#endif
+/* char * [r3] strchrnul (char *s [r3], int c [r4])  */
+	.machine  power7
+ENTRY (STRCHRNUL)
+	CALL_MCOUNT 2
+	dcbt	0,r3
+	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
+
+	/* Replicate byte to doubleword.  */
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
+	insrdi	r4,r4,32,0
+
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
+	li	r0,0	      /* Doubleword with null chars to use
+				 with cmpb.  */
+
+	/* Now r4 has a doubleword of c bytes and r0 has
+	   a doubleword of null bytes.  */
+
+	cmpb	r10,r12,r0    /* Compare each byte against null byte.  */
+	cmpb	r9,r12,r4     /* Compare each byte against c byte.  */
+
+	/* Move the doublewords left and right to discard the bits that are
+	   not part of the string and to bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	srd	r9,r9,r6
+	sld	r10,r10,r6
+	sld	r9,r9,r6
+#else
+	sld	r10,r10,r6
+	sld	r9,r9,r6
+	srd	r10,r10,r6
+	srd	r9,r9,r6
+#endif
+	or	r5,r9,r10     /* OR the results to speed things up.  */
+	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
+				 have been found.  */
+	bne	cr7,L(done)
+
+	mtcrf   0x01,r8
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	28,L(loop)
+
+	/* Handle DWORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb	r10,r12,r0
+	cmpb	r9,r12,r4
+	or	r5,r9,r10
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done)
+	b	L(loop)	      /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	.p2align  5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu     r11,16(r8)
+	cmpb	r10,r12,r0
+	cmpb	r9,r12,r4
+	cmpb	r6,r11,r0
+	cmpb	r7,r11,r4
+	or	r5,r9,r10
+	or	r10,r6,r7
+	or	r11,r5,r10
+	cmpdi	cr7,r11,0
+	beq	cr7,L(loop)
+
+	/* OK, one (or both) of the doublewords contains a c/null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a c/null byte.  */
+
+	cmpdi	cr6,r5,0
+	addi	r8,r8,-8
+	bne	cr6,L(done)
+
+	/* The c/null byte must be in the second doubleword.  Adjust the
+	   address again and move the result of cmpb to r5 so we can calculate
+	   the pointer.  */
+	mr	r5,r10
+	addi	r8,r8,8
+
+	/* r5 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the c/null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi    r0,r5,-1
+	andc    r0,r0,r5
+	popcntd	r0,r0
+#else
+	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
+#endif
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of matching c/null byte.  */
+	blr
+END (STRCHRNUL)
+weak_alias (STRCHRNUL, strchrnul)
+libc_hidden_builtin_def (STRCHRNUL)
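The routine is almost identical to strchr above; the payoff of returning the terminator instead of NULL is that the two cmpb results only ever need to be OR-ed (the or r5,r9,r10 above), and the first set byte of the merged mask is the answer, with no need to order the c match against the null match.  A rough little-endian C model under the same assumptions as the strchr sketch, with the same zero-byte trick standing in for cmpb; strchrnul_model is a made-up name:

#include <stdint.h>

char *
strchrnul_model (const char *s, int c)
{
  uint64_t rep = (uint8_t) c * 0x0101010101010101ULL;
  const uint64_t *p = (const uint64_t *) s;   /* Assume 8-byte aligned.  */
  for (;; p++)
    {
      uint64_t v1 = *p ^ rep, v2 = *p;
      uint64_t hit = (v1 - 0x0101010101010101ULL) & ~v1
                     & 0x8080808080808080ULL;      /* c matches */
      uint64_t nul = (v2 - 0x0101010101010101ULL) & ~v2
                     & 0x8080808080808080ULL;      /* terminator */
      uint64_t m = hit | nul;                      /* or r5,r9,r10 */
      if (m)
        return (char *) p + __builtin_ctzll (m) / 8;  /* little-endian */
    }
}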
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcmp.S
new file mode 100644
index 0000000000..14e14f457e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strcmp.S
@@ -0,0 +1,168 @@
+/* Optimized strcmp implementation for Power7 using 'cmpb' instruction
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* The optimization is achieved here through the cmpb instruction.
+   8-byte aligned strings are processed with doubleword comparison
+   and unaligned strings are handled effectively with a loop
+   unrolling technique.  */
+
+#include <sysdep.h>
+
+#ifndef STRCMP
+# define STRCMP strcmp
+#endif
+
+/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])  */
+
+	.machine	power7
+EALIGN (STRCMP, 4, 0)
+	CALL_MCOUNT 2
+
+	or r9, r3, r4
+	rldicl. r10, r9, 0, 61	/* are s1 and s2 8-byte aligned?  */
+	bne cr0, L(process_unaligned_bytes)
+	li	r5, 0
+
+	.align 4
+/* Process input parameters on a doubleword-aligned boundary.  */
+L(unrollDword):
+	ld	r8,0(r3)
+	ld	r10,0(r4)
+	cmpb	r7,r8,r5
+	cmpdi	cr7,r7,0
+	mr	r9,r7
+	bne 	cr7,L(null_found)
+	cmpld	cr7,r8,r10
+	bne	cr7,L(different)
+
+	ld	r8,8(r3)
+	ld	r10,8(r4)
+	cmpb	r7,r8,r5
+	cmpdi	cr7,r7,0
+	mr	r9,r7
+	bne 	cr7,L(null_found)
+	cmpld	cr7,r8,r10
+	bne	cr7,L(different)
+
+	ld	r8,16(r3)
+	ld	r10,16(r4)
+	cmpb	r7,r8,r5
+	cmpdi	cr7,r7,0
+	mr	r9,r7
+	bne 	cr7,L(null_found)
+	cmpld	cr7,r8,r10
+	bne	cr7,L(different)
+
+	ld	r8,24(r3)
+	ld	r10,24(r4)
+	cmpb	r7,r8,r5
+	cmpdi	cr7,r7,0
+	mr	r9,r7
+	bne 	cr7,L(null_found)
+	cmpld	cr7,r8,r10
+	bne	cr7,L(different)
+
+	addi r3, r3, 32
+	addi r4, r4, 32
+	beq cr7, L(unrollDword)
+
+	.align 4
+L(null_found):
+#ifdef __LITTLE_ENDIAN__
+	neg	r7,r9
+	and	r9,r9,r7
+	li	r7,-1
+	cntlzd	r9,r9
+	subfic	r9,r9,71
+	sld	r9,r7,r9
+#else
+	cntlzd	r9,r9
+	li	r7,-1
+	addi	r9,r9,8
+	srd	r9,r7,r9
+#endif
+	or	r8,r8,r9
+	or	r10,r10,r9
+
+L(different):
+	cmpb	r9,r8,r10
+#ifdef __LITTLE_ENDIAN__
+	addi	r7,r9,1
+	andc	r9,r7,r9
+	cntlzd	r9,r9
+	subfic	r9,r9,63
+#else
+	not	r9,r9
+	cntlzd	r9,r9
+	subfic	r9,r9,56
+#endif
+	srd	r3,r8,r9
+	srd	r10,r10,r9
+	rldicl	r10,r10,0,56
+	rldicl	r3,r3,0,56
+	subf	r3,r10,r3
+	blr
+
+	.align 4
+L(process_unaligned_bytes):
+	lbz r9, 0(r3)		/* load byte from s1  */
+	lbz r10, 0(r4)		/* load byte from s2  */
+	cmpdi cr7, r9, 0	/* compare *s1 with NULL  */
+	beq cr7, L(diffOfNULL)	/* if *s1 is null, return *s1 - *s2  */
+	cmplw cr7, r9, r10	/* compare *s1 and *s2  */
+	bne cr7, L(ComputeDiff)	/* branch to compute difference and return  */
+
+	lbz r9, 1(r3)		/* load next byte from s1  */
+	lbz r10, 1(r4)		/* load next byte from s2  */
+	cmpdi cr7, r9, 0	/* compare *s1 with NULL  */
+	beq cr7, L(diffOfNULL)	/* if *s1 is null, return *s1 - *s2  */
+	cmplw cr7, r9, r10	/* compare *s1 and *s2  */
+	bne cr7, L(ComputeDiff)	/* branch to compute difference and return  */
+
+	lbz r9, 2(r3)		/* unroll 3rd byte here  */
+	lbz r10, 2(r4)
+	cmpdi cr7, r9, 0
+	beq cr7, L(diffOfNULL)
+	cmplw cr7, r9, r10
+	bne cr7, L(ComputeDiff)
+
+	lbz r9, 3(r3)		/* unroll 4th byte now  */
+	lbz r10, 3(r4)
+	addi r3, r3, 4		/* increment s1 by unroll factor  */
+	cmpdi cr7, r9, 0
+	cmplw cr6, r9, r10
+	beq cr7, L(diffOfNULL)
+	addi r4, r4, 4		/* increment s2 by unroll factor  */
+	beq cr6, L(process_unaligned_bytes)	/* unroll byte processing  */
+
+	.align 4
+L(ComputeDiff):
+	extsw r9, r9
+	subf r10, r10, r9	/* compute *s1 - *s2  */
+	extsw r3, r10
+	blr			/* return  */
+
+	.align 4
+L(diffOfNULL):
+	li r9, 0
+	subf r10, r10, r9	/* compute *s1 - *s2  */
+	extsw r3, r10		/* sign extend result  */
+	blr			/* return  */
+
+END (STRCMP)
+libc_hidden_builtin_def (strcmp)
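The structure of the aligned path above is: compare a doubleword from each string, stop when the words differ or when s1's word contains a terminator, then resolve the result from the first differing or terminating byte, which is what the masking in L(null_found) and the cmpb/cntlzd arithmetic in L(different) compute without leaving registers.  A behavioral C model of that structure, illustrative only: it assumes both strings are 8-byte aligned, as the asm verifies before entering L(unrollDword), and it resolves the final word byte by byte instead of with masks.  strcmp_model is a made-up name.

#include <stdint.h>

int
strcmp_model (const char *s1, const char *s2)
{
  const uint64_t *p1 = (const uint64_t *) s1;
  const uint64_t *p2 = (const uint64_t *) s2;
  for (;; p1++, p2++)
    {
      uint64_t w1 = *p1, w2 = *p2;
      /* Zero-byte test on w1, as cmpb r7,r8,r5 does above.  */
      int has_nul = ((w1 - 0x0101010101010101ULL) & ~w1
                     & 0x8080808080808080ULL) != 0;
      if (w1 == w2 && !has_nul)
        continue;
      /* Final word: the first differing or terminating byte decides.  */
      const unsigned char *b1 = (const unsigned char *) p1;
      const unsigned char *b2 = (const unsigned char *) p2;
      for (int i = 0; i < 8; i++)
        if (b1[i] != b2[i] || b1[i] == 0)
          return b1[i] - b2[i];
      return 0;   /* Equal words containing the terminator.  */
    }
}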
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strlen.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strlen.S
new file mode 100644
index 0000000000..63848c460c
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strlen.S
@@ -0,0 +1,107 @@
+/* Optimized strlen implementation for PowerPC64/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* size_t [r3] strlen (char *s [r3])  */
+
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+	.machine  power7
+ENTRY (STRLEN)
+	CALL_MCOUNT 1
+	dcbt	0,r3
+	clrrdi	r4,r3,3	      /* Align the address to doubleword boundary.  */
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+	li	r0,0	      /* Doubleword with null chars to use
+				 with cmpb.  */
+	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
+	ld	r12,0(r4)     /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	r5,r5,r6
+#else
+	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
+	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
+	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
+	cmpdi	cr7,r10,0     /* If r10 == 0, no nulls have been found.  */
+	bne	cr7,L(done)
+
+	mtcrf   0x01,r4
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	28,L(loop)
+
+	/* Handle DWORD2 of pair.  */
+	ldu	r12,8(r4)
+	cmpb	r10,r12,r0
+	cmpdi	cr7,r10,0
+	bne	cr7,L(done)
+
+	/* Main loop to look for the end of the string.  Since it's a
+	   small loop (< 8 instructions), align it to 32 bytes.  */
+	.p2align  5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+
+	ld	r12, 8(r4)
+	ldu	r11, 16(r4)
+	cmpb	r10,r12,r0
+	cmpb	r9,r11,r0
+	or	r8,r9,r10     /* Merge everything in one doubleword.  */
+	cmpdi	cr7,r8,0
+	beq	cr7,L(loop)
+
+	/* OK, one (or both) of the doublewords contains a null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a null byte.  */
+
+	cmpdi	cr6,r10,0
+	addi	r4,r4,-8
+	bne	cr6,L(done)
+
+	/* The null byte must be in the second doubleword.  Adjust the address
+	   again and move the result of cmpb to r10 so we can calculate the
+	   length.  */
+
+	mr	r10,r9
+	addi	r4,r4,8
+
+	/* r10 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the null byte in the original
+	   doubleword from the string.  Use that to calculate the length.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi	r9, r10, -1   /* Form a mask from trailing zeros.  */
+	andc	r9, r9, r10
+	popcntd r0, r9	      /* Count the bits in the mask.  */
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	subf	r5,r3,r4
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r5,r0      /* Compute final length.  */
+	blr
+END (STRLEN)
+libc_hidden_builtin_def (strlen)
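The subtle part above is the startup: the address is aligned down, so the first doubleword may contain bytes that precede the string, and the orc r9,r12,r5 forces exactly those bytes to non-zero so the null scan cannot trigger on them (the mask is all-ones shifted by the padding, and orc ORs its complement over the loaded word).  A little-endian C model of that masking, illustrative only; the zero-byte trick again stands in for cmpb, and strlen_model is a made-up name:

#include <stdint.h>
#include <stddef.h>

size_t
strlen_model (const char *s)
{
  const uint64_t *p = (const uint64_t *) ((uintptr_t) s & ~(uintptr_t) 7);
  unsigned pad = (uintptr_t) s & 7;        /* Bytes before the string.  */
  /* MASK = -1 << (8 * pad) covers the real string bytes (the sld);
     OR-ing its complement in is the orc: pad bytes become 0xff.  */
  uint64_t w = *p | ~(~0ULL << (8 * pad));
  for (;;)
    {
      uint64_t z = (w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL;
      if (z != 0)
        return ((const char *) p - s) + __builtin_ctzll (z) / 8;
      w = *++p;
    }
}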
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncmp.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncmp.S
new file mode 100644
index 0000000000..d53b31be8e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncmp.S
@@ -0,0 +1,227 @@
+/* Optimized strncmp implementation for POWER7/PowerPC64.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef STRNCMP
+# define STRNCMP strncmp
+#endif
+
+/* See strlen.S for comments on how the end-of-string testing works.  */
+
+/* int [r3] strncmp (const char *s1 [r3],
+		     const char *s2 [r4],
+		     size_t size [r5])  */
+
+EALIGN (STRNCMP,5,0)
+	CALL_MCOUNT 3
+
+#define rTMP2	r0
+#define rRTN	r3
+#define rSTR1	r3	/* first string arg */
+#define rSTR2	r4	/* second string arg */
+#define rN	r5	/* max string length */
+#define rWORD1	r6	/* current word in s1 */
+#define rWORD2	r7	/* current word in s2 */
+#define rWORD3  r10
+#define rWORD4  r11
+#define rFEFE	r8	/* constant 0xfefefefefefefeff (-0x0101010101010101) */
+#define r7F7F	r9	/* constant 0x7f7f7f7f7f7f7f7f */
+#define rNEG	r10	/* ~(word in s1 | 0x7f7f7f7f7f7f7f7f) */
+#define rBITDIF	r11	/* bits that differ in s1 & s2 words */
+#define rTMP	r12
+
+	dcbt	0,rSTR1
+	nop
+	or	rTMP,rSTR2,rSTR1
+	lis	r7F7F,0x7f7f
+	dcbt	0,rSTR2
+	nop
+	clrldi.	rTMP,rTMP,61
+	cmpldi	cr1,rN,0
+	lis	rFEFE,-0x101
+	bne	L(unaligned)
+/* We are doubleword aligned so set up for two loops.  First a doubleword
+   loop, then fall into the byte loop if any residual remains.  */
+	srdi.	rTMP,rN,3
+	clrldi	rN,rN,61
+	addi	rFEFE,rFEFE,-0x101
+	addi	r7F7F,r7F7F,0x7f7f
+	cmpldi	cr1,rN,0
+	beq	L(unaligned)
+
+	mtctr	rTMP
+	ld	rWORD1,0(rSTR1)
+	ld	rWORD2,0(rSTR2)
+	sldi	rTMP,rFEFE,32
+	insrdi	r7F7F,r7F7F,32,0
+	add	rFEFE,rFEFE,rTMP
+	b	L(g1)
+
+L(g0):
+	ldu	rWORD1,8(rSTR1)
+	bne	cr1,L(different)
+	ldu	rWORD2,8(rSTR2)
+L(g1):	add	rTMP,rFEFE,rWORD1
+	nor	rNEG,r7F7F,rWORD1
+	bdz	L(tail)
+	and.	rTMP,rTMP,rNEG
+	cmpd	cr1,rWORD1,rWORD2
+	beq	L(g0)
+
+/* OK. We've hit the end of the string. We need to be careful that
+   we don't compare two strings as different because of gunk beyond
+   the end of the strings...  */
+
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+	addi    rTMP2, rTMP, -1
+	beq	cr1, L(equal)
+	andc    rTMP2, rTMP2, rTMP
+	rldimi	rTMP2, rTMP2, 1, 0
+	and	rWORD2, rWORD2, rTMP2	/* Mask off gunk.  */
+	and	rWORD1, rWORD1, rTMP2
+	cmpd	cr1, rWORD1, rWORD2
+	beq	cr1, L(equal)
+	cmpb	rBITDIF, rWORD1, rWORD2	/* 0xff on equal bytes.  */
+	addi	rNEG, rBITDIF, 1
+	orc	rNEG, rNEG, rBITDIF	/* 0's below LS differing byte.  */
+	sldi	rNEG, rNEG, 8		/* 1's above LS differing byte.  */
+	andc	rWORD1, rWORD1, rNEG	/* mask off MS bytes.  */
+	andc	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt	L(highbit)
+	sradi	rRTN, rRTN, 63		/* must return an int.  */
+	ori	rRTN, rRTN, 1
+	blr
+L(equal):
+	li	rRTN, 0
+	blr
+
+L(different):
+	ld	rWORD1, -8(rSTR1)
+	cmpb	rBITDIF, rWORD1, rWORD2	/* 0xff on equal bytes.  */
+	addi	rNEG, rBITDIF, 1
+	orc	rNEG, rNEG, rBITDIF	/* 0's below LS differing byte.  */
+	sldi	rNEG, rNEG, 8		/* 1's above LS differing byte.  */
+	andc	rWORD1, rWORD1, rNEG	/* mask off MS bytes.  */
+	andc	rWORD2, rWORD2, rNEG
+	xor.	rBITDIF, rWORD1, rWORD2
+	sub	rRTN, rWORD1, rWORD2
+	blt	L(highbit)
+	sradi	rRTN, rRTN, 63
+	ori	rRTN, rRTN, 1
+	blr
+L(highbit):
+	sradi	rRTN, rWORD2, 63
+	ori	rRTN, rRTN, 1
+	blr
+
+#else
+L(endstring):
+	and	rTMP,r7F7F,rWORD1
+	beq	cr1,L(equal)
+	add	rTMP,rTMP,r7F7F
+	xor.	rBITDIF,rWORD1,rWORD2
+	andc	rNEG,rNEG,rTMP
+	blt	L(highbit)
+	cntlzd	rBITDIF,rBITDIF
+	cntlzd	rNEG,rNEG
+	addi	rNEG,rNEG,7
+	cmpd	cr1,rNEG,rBITDIF
+	sub	rRTN,rWORD1,rWORD2
+	blt	cr1,L(equal)
+	sradi	rRTN,rRTN,63		/* must return an int.  */
+	ori	rRTN,rRTN,1
+	blr
+L(equal):
+	li	rRTN,0
+	blr
+
+L(different):
+	ld	rWORD1,-8(rSTR1)
+	xor.	rBITDIF,rWORD1,rWORD2
+	sub	rRTN,rWORD1,rWORD2
+	blt	L(highbit)
+	sradi	rRTN,rRTN,63
+	ori	rRTN,rRTN,1
+	blr
+L(highbit):
+	sradi	rRTN,rWORD2,63
+	ori	rRTN,rRTN,1
+	blr
+#endif
+
+/* Oh well.  In this case, we just do a byte-by-byte comparison.  */
+	.align	4
+L(tail):
+	and.	rTMP,rTMP,rNEG
+	cmpd	cr1,rWORD1,rWORD2
+	bne	L(endstring)
+	addi	rSTR1,rSTR1,8
+	bne	cr1,L(different)
+	addi	rSTR2,rSTR2,8
+	cmpldi	cr1,rN,0
+L(unaligned):
+	mtctr	rN
+	ble	cr1,L(ux)
+L(uz):
+	lbz	rWORD1,0(rSTR1)
+	lbz	rWORD2,0(rSTR2)
+	.align	4
+L(u1):
+	cmpdi	cr1,rWORD1,0
+	bdz	L(u4)
+	cmpd	rWORD1,rWORD2
+	beq	cr1,L(u4)
+	bne	L(u4)
+	lbzu	rWORD3,1(rSTR1)
+	lbzu	rWORD4,1(rSTR2)
+	cmpdi	cr1,rWORD3,0
+	bdz	L(u3)
+	cmpd	rWORD3,rWORD4
+	beq	cr1,L(u3)
+	bne	L(u3)
+	lbzu	rWORD1,1(rSTR1)
+	lbzu	rWORD2,1(rSTR2)
+	cmpdi	cr1,rWORD1,0
+	bdz	L(u4)
+	cmpd	rWORD1,rWORD2
+	beq	cr1,L(u4)
+	bne	L(u4)
+	lbzu	rWORD3,1(rSTR1)
+	lbzu	rWORD4,1(rSTR2)
+	cmpdi	cr1,rWORD3,0
+	bdz	L(u3)
+	cmpd	rWORD3,rWORD4
+	beq	cr1,L(u3)
+	bne	L(u3)
+	lbzu	rWORD1,1(rSTR1)
+	lbzu	rWORD2,1(rSTR2)
+	b	L(u1)
+
+L(u3):  sub	rRTN,rWORD3,rWORD4
+	blr
+L(u4):	sub	rRTN,rWORD1,rWORD2
+	blr
+L(ux):
+	li	rRTN,0
+	blr
+END (STRNCMP)
+libc_hidden_builtin_def (strncmp)
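The rFEFE/r7F7F constants set up in the prologue implement the end-of-string test that the strlen.S cross-reference alludes to: add rTMP,rFEFE,rWORD1 and nor rNEG,r7F7F,rWORD1 feed and. rTMP,rTMP,rNEG, which is non-zero exactly when the word holds a zero byte.  The same test as a C expression, a model rather than glibc code:

#include <stdint.h>

/* Non-zero iff the doubleword v contains a zero byte.
   v + 0xfefefefefefefeff   ==  v - 0x0101010101010101    (add of rFEFE)
   ~(v | 0x7f7f7f7f7f7f7f7f) ==  ~v & 0x8080808080808080  (nor with r7F7F) */
static inline int
has_zero_byte (uint64_t v)
{
  return ((v + 0xfefefefefefefeffULL)
          & ~(v | 0x7f7f7f7f7f7f7f7fULL)) != 0;
}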
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S
new file mode 100644
index 0000000000..0224f74898
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strncpy.S
@@ -0,0 +1,722 @@
+/* Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Implements the functions
+
+   char * [r3] strncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+   AND
+
+   char * [r3] stpncpy (char *dst [r3], const char *src [r4], size_t n [r5])
+
+   The algorithm is as follows:
+   > if src and dest are 8-byte aligned, perform doubleword copy
+     else
+   > copy byte by byte on unaligned addresses.
+
+   The aligned comparisons are made using the cmpb instruction.  */
+
+/* The focus of the optimization for performance improvement is as follows:
+   1. Data alignment [gain from aligned memory access on read/write].
+   2. Loop unrolling/unwinding, from which POWER7 gains performance
+      [gain by reduction of branch penalty].
+   3. The final pad with null bytes is done by calling an optimized
+      memset.  */
+
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+#  define FUNC_NAME __stpncpy
+# else
+#  define FUNC_NAME STPNCPY
+# endif
+#else
+# ifndef STRNCPY
+#  define FUNC_NAME strncpy
+# else
+#  define FUNC_NAME STRNCPY
+# endif
+#endif  /* !USE_AS_STPNCPY  */
+
+#define		FRAMESIZE	(FRAME_MIN_SIZE+32)
+
+#ifndef MEMSET
+/* For builds with no IFUNC support, local calls should be made to internal
+   GLIBC symbol (created by libc_hidden_builtin_def).  */
+# ifdef SHARED
+#  define MEMSET   __GI_memset
+# else
+#  define MEMSET   memset
+# endif
+#endif
+
+	.machine  power7
+EALIGN(FUNC_NAME, 4, 0)
+	CALL_MCOUNT 3
+
+	mflr r0			/* load link register LR into r0  */
+	or r10, r3, r4		/* merge src and dst to verify alignment  */
+	rldicl. r8, r10, 0, 61	/* are both doubleword aligned?  */
+
+	std r19, -8(r1)		/* save caller's register r19  */
+	std r18, -16(r1)	/* save caller's register r18  */
+	std r0, 16(r1)		/* store the link register  */
+	stdu r1, -FRAMESIZE(r1)	/* create the stack frame  */
+
+	mr r9, r3		/* save r3 into r9 for use  */
+	mr r18, r3		/* save r3 for the return value of strncpy  */
+	bne cr0, L(unaligned)
+
+L(aligned):
+	srdi r11, r5, 3		/* compute count for CTR; count = n/8  */
+	cmpldi cr7, r11, 3	/* if count > 3, perform 4-way loop unrolling  */
+	ble cr7, L(update1)
+
+	ld r10, 0(r4)		/* load doubleword from src  */
+	cmpb r8, r10, r8	/* compare the doubleword just read with nulls  */
+	cmpdi cr7, r8, 0	/* if cmpb found no null, we continue  */
+	bne cr7, L(update3)
+
+	std r10, 0(r3)		/* copy doubleword at offset=0  */
+	ld r10, 8(r4)		/* load next doubleword from offset=8  */
+	cmpb r8, r10, r8	/* compare the doubleword just read with nulls  */
+	cmpdi cr7, r8, 0	/* if cmpb found no null, we continue  */
+	bne cr7, L(HopBy8)
+
+	addi r8, r11, -4
+	mr r7, r3
+	srdi r8, r8, 2
+	mr r6, r4
+	addi r8, r8, 1
+	li r12, 0
+	mtctr r8
+	b L(dwordCopy)
+
+	.p2align 4
+L(dWordUnroll):
+	std r8, 16(r9)
+	ld r8, 24(r4)		/* load dword, perform loop unrolling again  */
+	cmpb r10, r8, r10
+	cmpdi cr7, r10, 0
+	bne cr7, L(HopBy24)
+
+	std r8, 24(r7)		/* copy dword at offset=24  */
+	addi r9, r9, 32
+	addi r4, r4, 32
+	bdz  L(leftDwords)	/* continue with loop on counter  */
+
+	ld r3, 32(r6)
+	cmpb r8, r3, r10
+	cmpdi cr7, r8, 0
+	bne cr7, L(update2)
+
+	std r3, 32(r7)
+	ld r10, 40(r6)
+	cmpb r8, r10, r8
+	cmpdi cr7, r8, 0
+	bne cr7, L(HopBy40)
+
+	mr r6, r4		/* update values  */
+	mr r7, r9
+	mr r11, r0
+	mr r5, r19
+
+L(dwordCopy):
+	std r10, 8(r9)		/* copy dword at offset=8  */
+	addi r19, r5, -32
+	addi r0, r11, -4
+	ld r8, 16(r4)
+	cmpb r10, r8, r12
+	cmpdi cr7, r10, 0
+	beq cr7, L(dWordUnroll)
+
+	addi r9, r9, 16		/* increment dst by 16  */
+	addi r4, r4, 16		/* increment src by 16  */
+	addi r5, r5, -16	/* decrement length 'n' by 16  */
+	addi r0, r11, -2	/* decrement loop counter  */
+
+L(dWordUnrollOFF):
+	ld r10, 0(r4)		/* load first dword  */
+	li r8, 0		/* load mask  */
+	cmpb r8, r10, r8
+	cmpdi cr7, r8, 0
+	bne cr7, L(byte_by_byte)
+	mtctr r0
+	li r7, 0
+	b L(CopyDword)
+
+	.p2align 4
+L(loadDWordandCompare):
+	ld r10, 0(r4)
+	cmpb r8, r10, r7
+	cmpdi cr7, r8, 0
+	bne cr7, L(byte_by_byte)
+
+L(CopyDword):
+	addi r9, r9, 8
+	std r10, -8(r9)
+	addi r4, r4, 8
+	addi r5, r5, -8
+	bdnz L(loadDWordandCompare)
+
+L(byte_by_byte):
+	cmpldi cr7, r5, 3
+	ble cr7, L(verifyByte)
+	srdi r10, r5, 2
+	mr r19, r9
+	mtctr r10
+	b L(firstByteUnroll)
+
+	.p2align 4
+L(bytes_unroll):
+	lbz r10, 1(r4)		/* load byte from src  */
+	cmpdi cr7, r10, 0	/* compare for NULL  */
+	stb r10, 1(r19)		/* store byte to dst  */
+	beq cr7, L(updtDestComputeN2ndByte)
+
+	addi r4, r4, 4		/* advance src  */
+
+	lbz r10, -2(r4)		/* perform loop unrolling for byte r/w  */
+	cmpdi cr7, r10, 0
+	stb r10, 2(r19)
+	beq cr7, L(updtDestComputeN3rdByte)
+
+	lbz r10, -1(r4)		/* perform loop unrolling for byte r/w  */
+	addi r19, r19, 4
+	cmpdi cr7, r10, 0
+	stb r10, -1(r19)
+	beq cr7, L(ComputeNByte)
+
+	bdz L(update0)
+
+L(firstByteUnroll):
+	lbz r10, 0(r4)		/* perform loop unrolling for byte r/w  */
+	cmpdi cr7, r10, 0
+	stb r10, 0(r19)
+	bne cr7, L(bytes_unroll)
+	addi r19, r19, 1
+
+L(ComputeNByte):
+	subf r9, r19, r9	/* compute number of bytes to fill  */
+	add r8, r9, r5
+
+L(zeroFill):
+	cmpdi cr7, r8, 0	/* compare if length is zero  */
+	beq cr7, L(update3return)
+
+	mr r3, r19		/* buffer to fill  */
+	li r4, 0		/* fill the buffer with zeros  */
+	mr r5, r8		/* number of bytes to fill  */
+	bl MEMSET		/* call optimized memset  */
+	nop
+
+L(update3return):
+#ifdef USE_AS_STPNCPY
+	addi r3, r19, -1	/* update return value  */
+#endif
+
+L(hop2return):
+#ifndef USE_AS_STPNCPY
+	mr r3, r18		/* set return value  */
+#endif
+	addi r1, r1, FRAMESIZE	/* restore stack pointer  */
+	ld r0, 16(r1)		/* read the saved link register  */
+	ld r18, -16(r1)		/* restore caller's save register r18  */
+	ld r19, -8(r1)		/* restore caller's save register r19  */
+	mtlr r0			/* restore the link register  */
+	blr			/* return  */
+
+	.p2align 4
+L(update0):
+	mr r9, r19
+
+	.p2align 4
+L(verifyByte):
+	rldicl. r8, r5, 0, 62
+#ifdef USE_AS_STPNCPY
+	mr r3, r9
+#endif
+	beq cr0, L(hop2return)
+	mtctr r8
+	addi r4, r4, -1
+	mr r19, r9
+	b L(oneBYone)
+
+	.p2align 4
+L(proceed):
+	bdz L(done)
+
+L(oneBYone):
+	lbzu r10, 1(r4)		/* copy byte  */
+	addi r19, r19, 1
+	addi r8, r8, -1
+	cmpdi cr7, r10, 0
+	stb r10, -1(r19)
+	bne cr7, L(proceed)
+	b L(zeroFill)
+
+	.p2align 4
+L(done):
+	addi r1, r1, FRAMESIZE	/* restore stack pointer  */
+#ifdef USE_AS_STPNCPY
+	mr r3, r19		/* set the return value  */
+#else
+	mr r3, r18		/* set the return value  */
+#endif
+	ld r0, 16(r1)		/* read the saved link register  */
+	ld r18, -16(r1)		/* restore caller's save register r18  */
+	ld r19, -8(r1)		/* restore caller's save register r19  */
+	mtlr r0			/* restore the link register  */
+	blr			/* return  */
+
+L(update1):
+	mr r0, r11
+	mr r19, r5
+
+	.p2align 4
+L(leftDwords):
+	cmpdi cr7, r0, 0
+	mr r5, r19
+	bne cr7, L(dWordUnrollOFF)
+	b L(byte_by_byte)
+
+	.p2align 4
+L(updtDestComputeN2ndByte):
+	addi r19, r19, 2	/* update dst by 2  */
+	subf r9, r19, r9	/* compute distance covered  */
+	add r8, r9, r5
+	b L(zeroFill)
+
+	.p2align 4
+L(updtDestComputeN3rdByte):
+	addi r19, r19, 3	/* update dst by 3  */
+	subf r9, r19, r9	/* compute distance covered  */
+	add r8, r9, r5
+	b L(zeroFill)
+
+	.p2align 4
+L(HopBy24):
+	addi r9, r9, 24		/* increment dst by 24  */
+	addi r4, r4, 24		/* increment src by 24  */
+	addi r5, r5, -24	/* decrement length 'n' by 24  */
+	addi r0, r11, -3	/* decrement loop counter  */
+	b L(dWordUnrollOFF)
+
+	.p2align 4
+L(update2):
+	mr r5, r19
+	b L(dWordUnrollOFF)
+
+	.p2align 4
+L(HopBy40):
+	addi r9, r7, 40		/* increment dst by 40  */
+	addi r4, r6, 40		/* increment src by 40  */
+	addi r5, r5, -40	/* decrement length 'n' by 40  */
+	addi r0, r11, -5	/* decrement loop counter  */
+	b L(dWordUnrollOFF)
+
+L(update3):
+	mr r0, r11
+	b L(dWordUnrollOFF)
+
+L(HopBy8):
+	addi r9, r3, 8		/* increment dst by 8  */
+	addi r4, r4, 8		/* increment src by 8  */
+	addi r5, r5, -8		/* decrement length 'n' by 8  */
+	addi r0, r11, -1	/* decrement loop counter  */
+	b L(dWordUnrollOFF)
+
+L(unaligned):
+	cmpdi	r5, 16		/* Proceed byte by byte for less than 16  */
+	ble	L(byte_by_byte)
+	rldicl	r7, r3, 0, 61
+	rldicl	r6, r4, 0, 61
+	cmpdi	r6, 0	/* Check src alignment */
+	beq	L(srcaligndstunalign)
+	/* src is unaligned */
+	rlwinm	r10, r4, 3,26,28	/* Calculate padding.  */
+	clrrdi	r4, r4, 3	/* Align the addr to dw boundary */
+	ld	r8, 0(r4)	/* Load doubleword from memory.  */
+	li	r0, 0
+	/* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+	srd	r7, r8, r10
+#else
+	sld	r7, r8, r10
+#endif
+	cmpb	r0, r7, r0	/* Compare each byte against null */
+	/* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+	sld	r0, r0, r10
+#else
+	srd	r0, r0, r10
+#endif
+	cmpdi	r0, 0
+	bne     L(bytebybyte)	/* if it has null, copy byte by byte */
+	subfic	r6, r6, 8
+	rlwinm	r12, r3, 3,26,28	/* Calculate padding in bits.  */
+	rldicl	r9, r3, 0, 61	/* Calculate padding in bytes. */
+	addi	r3, r3, -1
+
+	cmpdi	r12, 0	/* check dest alignment */
+	beq     L(srcunaligndstalign)
+
+	/* both src and dst unaligned */
+#ifdef __LITTLE_ENDIAN__
+	sld	r8, r7, r10
+	mr	r11, r10
+	addi	r11, r11, -8	/* Adjust byte pointer on loaded dw */
+#else
+	srd	r8, r7, r10
+	subfic	r11, r10, 64
+#endif
+	/* dst alignment is greater than src alignment?  */
+	cmpd    cr7, r12, r10
+	ble     cr7, L(dst_align_small)
+	/* src alignment is less than dst */
+
+	/* Calculate the dst alignment difference  */
+	subfic	r7, r9, 8
+	mtctr	r7
+
+	/* Write until dst is aligned  */
+	cmpdi	cr0, r7, 4
+	blt     L(storebyte1)	/* less than 4, store byte by byte  */
+	beq     L(equal1)	/* if it's 4, store word  */
+	addi	r0, r7, -4	/* greater than 4, so stb and stw  */
+	mtctr	r0
+L(storebyte1):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8	/* Adjust byte pointer on loaded dw  */
+#else
+	addi	r11, r11, -8
+#endif
+	srd	r7, r8, r11
+	stbu	r7, 1(r3)
+	addi	r5, r5, -1
+	bdnz    L(storebyte1)
+
+	subfic	r7, r9, 8	/* Check the remaining bytes  */
+	cmpdi	cr0, r7, 4
+	blt     L(proceed1)
+
+	.align 4
+L(equal1):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8	/* Adjust byte pointer on loaded dw  */
+	srd	r7, r8, r11
+#else
+	subfic	r11, r11, 64
+	sld	r7, r8, r11
+	srdi	r7, r7, 32
+#endif
+	stw	r7, 1(r3)
+	addi	r3, r3, 4
+	addi	r5, r5, -4
+
+L(proceed1):
+	mr	r7, r8
+	/* calculate the leftover bytes to be written  */
+	subfic	r11, r10, 64
+	subfic	r12, r12, 64
+	subf	r12, r12, r11	/* remaining bytes on second dw  */
+	subfic	r10, r12, 64	/* remaining bytes on first dw  */
+	subfic	r9, r9, 8
+	subf	r6, r9, r6	/* recalculate padding  */
+L(srcunaligndstalign):
+	addi	r3, r3, 1
+	subfic	r12, r10, 64	/* remaining bytes on second dw  */
+	addi	r4, r4, 8
+	li	r0,0
+	b       L(storedouble)
+
+	.align 4
+L(dst_align_small):
+	mtctr	r6
+	/* Write until src is aligned  */
+L(storebyte2):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8	/* Adjust byte pointer on dw  */
+#else
+	addi	r11, r11, -8
+#endif
+	srd	r7, r8, r11
+	stbu	r7, 1(r3)
+	addi	r5, r5, -1
+	bdnz    L(storebyte2)
+
+	addi	r4, r4, 8	/* Increment src pointer  */
+	addi	r3, r3, 1	/* Increment dst pointer  */
+	mr	r9, r3
+	li	r8, 0
+	cmpd    cr7, r12, r10
+	beq     cr7, L(aligned)
+	rldicl	r6, r3, 0, 61	/* Recalculate padding */
+	mr	r7, r6
+
+	/* src is aligned  */
+L(srcaligndstunalign):
+	mr	r9, r3
+	mr	r6, r7
+	ld	r8, 0(r4)
+	subfic	r10, r7, 8
+	mr	r7, r8
+	li	r0, 0	/* Check null */
+	cmpb	r0, r8, r0
+	cmpdi	r0, 0
+	bne     L(byte_by_byte)	/* Do byte by byte if there is a null  */
+	rlwinm	r12, r3, 3,26,28	/* Calculate padding  */
+	addi	r3, r3, -1
+	/* write byte by byte until aligned  */
+#ifdef __LITTLE_ENDIAN__
+	li	r11, -8
+#else
+	li	r11, 64
+#endif
+	mtctr	r10
+	cmpdi	cr0, r10, 4
+	blt     L(storebyte)
+	beq     L(equal)
+	addi	r0, r10, -4
+	mtctr	r0
+L(storebyte):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8	/* Adjust byte pointer on  dw  */
+#else
+	addi	r11, r11, -8
+#endif
+	srd	r7, r8, r11
+	stbu	r7, 1(r3)
+	addi	r5, r5, -1
+	bdnz    L(storebyte)
+
+	cmpdi	cr0, r10, 4
+	blt     L(align)
+
+	.align 4
+L(equal):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8
+	srd	r7, r8, r11
+#else
+	subfic	r11, r11, 64
+	sld	r7, r8, r11
+	srdi	r7, r7, 32
+#endif
+	stw	r7, 1(r3)
+	addi	r5, r5, -4
+	addi	r3, r3, 4
+L(align):
+	addi	r3, r3, 1
+	addi	r4, r4, 8	/* Increment src pointer  */
+	subfic	r10, r12, 64
+	li	r0, 0
+	/* dst addr aligned to 8 */
+L(storedouble):
+	cmpdi	r5, 8
+	ble	L(null1)
+	ld	r7, 0(r4)	/* load next dw  */
+	cmpb	r0, r7, r0
+	cmpdi	r0, 0	/* check for null on each new dw  */
+	bne     L(null)
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, r8, r10	/* bytes from first dw  */
+	sld	r11, r7, r12	/* bytes from second dw  */
+#else
+	sld	r9, r8, r10
+	srd	r11, r7, r12
+#endif
+	or	r11, r9, r11	/* make as a single dw  */
+	std	r11, 0(r3)	/* store as std on aligned addr  */
+	mr	r8, r7		/* still few bytes left to be written  */
+	addi	r3, r3, 8	/* increment dst addr  */
+	addi	r4, r4, 8	/* increment src addr  */
+	addi	r5, r5, -8
+	b       L(storedouble)	/* Loop until NULL  */
+
+	.align 4
+
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(null):
+	addi	r3, r3, -1
+	mr	r10, r12
+	mtctr	r6
+#ifdef __LITTLE_ENDIAN__
+	subfic	r10, r10, 64
+	addi	r10, r10, -8
+#endif
+	cmpdi	cr0, r5, 4
+	blt	L(loop)
+	cmpdi	cr0, r6, 4
+	blt     L(loop)
+
+	/* we can still use stw if leftover >= 4  */
+#ifdef __LITTLE_ENDIAN__
+	addi	r10, r10, 8
+	srd	r11, r8, r10
+#else
+	subfic	r10, r10, 64
+	sld	r11, r8, r10
+	srdi	r11, r11, 32
+#endif
+	stw	r11, 1(r3)
+	addi	r5, r5, -4
+	addi	r3, r3, 4
+	cmpdi	cr0, r5, 0
+	beq	L(g1)
+	cmpdi	cr0, r6, 4
+	beq     L(bytebybyte1)
+	addi	r10, r10, 32
+#ifdef __LITTLE_ENDIAN__
+	addi	r10, r10, -8
+#else
+	subfic	r10, r10, 64
+#endif
+	addi	r0, r6, -4
+	mtctr	r0
+	/* remaining byte by byte part of first dw  */
+L(loop):
+#ifdef __LITTLE_ENDIAN__
+	addi	r10, r10, 8
+#else
+	addi	r10, r10, -8
+#endif
+	srd	r0, r8, r10
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	cmpdi	cr0, r5, 0
+	beq	L(g1)
+	bdnz    L(loop)
+L(bytebybyte1):
+	addi	r3, r3, 1
+	/* remaining byte by byte part of second dw   */
+L(bytebybyte):
+	addi	r3, r3, -8
+	addi	r4, r4, -1
+
+#ifdef __LITTLE_ENDIAN__
+	extrdi. r0, r7, 8, 56
+	stbu	r7, 8(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	extrdi. r0, r7, 8, 48
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	extrdi. r0, r7, 8, 40
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	extrdi. r0, r7, 8, 32
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	extrdi. r0, r7, 8, 24
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	extrdi. r0, r7, 8, 16
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	extrdi. r0, r7, 8, 8
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	extrdi	r0, r7, 8, 0
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	b	L(g2)
+#else
+	extrdi. r0, r7, 8, 0
+	stbu	r0, 8(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	extrdi. r0, r7, 8, 8
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	extrdi. r0, r7, 8, 16
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	extrdi. r0, r7, 8, 24
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	extrdi. r0, r7, 8, 32
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	extrdi. r0, r7, 8, 40
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	extrdi. r0, r7, 8, 48
+	stbu	r0, 1(r3)
+	addi	r5, r5, -1
+	beq	L(g2)
+	cmpdi	r5, 0
+	beq	L(g1)
+	stbu	r7, 1(r3)
+	addi	r5, r5, -1
+	b	L(g2)
+#endif
+L(g1):
+#ifdef USE_AS_STPNCPY
+	addi	r3, r3, 1
+#endif
+L(g2):
+	addi	r3, r3, 1
+	mr	r19, r3
+	mr	r8, r5
+	b	L(zeroFill)
+L(null1):
+	mr	r9, r3
+	subf	r4, r6, r4
+	b	L(byte_by_byte)
+END(FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
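At the level of observable behavior, the whole routine above reduces to the following C sketch, which also shows why the asm can delegate the tail to MEMSET in L(zeroFill): everything after the copied bytes is one contiguous zero fill.  Illustrative only; strncpy_model is a made-up name, and strnlen here is the POSIX-2008 function:

#include <string.h>

char *
strncpy_model (char *dst, const char *src, size_t n)
{
  size_t len = strnlen (src, n);      /* Bytes of src to copy.  */
  memcpy (dst, src, len);
  if (len < n)
    memset (dst + len, 0, n - len);   /* The L(zeroFill) call.  */
#ifdef USE_AS_STPNCPY
  return dst + len;   /* stpncpy: the terminator, or dst + n.  */
#else
  return dst;         /* strncpy: the original dst.  */
#endif
}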
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strnlen.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strnlen.S
new file mode 100644
index 0000000000..a970b6ce30
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strnlen.S
@@ -0,0 +1,182 @@
+/* Optimized strnlen implementation for PowerPC64/POWER7 using cmpb insn.
+   Copyright (C) 2010-2017 Free Software Foundation, Inc.
+   Contributed by Luis Machado <luisgpm@br.ibm.com>.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef STRNLEN
+# define STRNLEN __strnlen
+#endif
+
+/* size_t [r3] strnlen (char *s [r3], size_t size [r4])  */
+	.machine  power7
+ENTRY (STRNLEN)
+	CALL_MCOUNT 2
+	dcbt	0,r3
+	clrrdi	r8,r3,3
+	add	r7,r3,r4      /* Calculate the last acceptable address.  */
+	cmpldi	r4,32
+	li	r0,0	      /* Doubleword with null chars.  */
+	addi	r7,r7,-1
+
+	/* If we have less than 33 bytes to search, skip to the faster code.  */
+	ble	L(small_range)
+
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
+	cmpb	r10,r12,r0    /* Check for null bytes in DWORD1.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	sld	r10,r10,r6
+#else
+	sld	r10,r10,r6
+	srd	r10,r10,r6
+#endif
+	cmpldi	cr7,r10,0     /* If r10 == 0, no nulls have been found.  */
+	bne	cr7,L(done)
+
+	clrrdi	r7,r7,3       /* Address of last doubleword.  */
+	mtcrf   0x01,r8
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	28,L(loop_setup)
+
+	/* Handle DWORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb	r10,r12,r0
+	cmpldi	cr7,r10,0
+	bne	cr7,L(done)
+
+L(loop_setup):
+	/* The last dword we want to read in the loop below is the one
+	   containing the last byte of the string, ie. the dword at
+	   (s + size - 1) & ~7, or r7.  The first dword read is at
+	   r8 + 8, we read 2 * cnt dwords, so the last dword read will
+	   be at r8 + 8 + 16 * cnt - 8.  Solving for cnt gives
+	   cnt = (r7 - r8) / 16  */
+	sub	r5,r7,r8
+	srdi	r6,r5,4	      /* Number of loop iterations.  */
+	mtctr	r6	      /* Setup the counter.  */
+
+	/* Main loop to look for the null byte in the string.  Since
+	   it's a small loop (< 8 instructions), align it to 32 bytes.  */
+	.p2align  5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+
+	ld	r12,8(r8)
+	ldu	r11,16(r8)
+	cmpb	r10,r12,r0
+	cmpb	r9,r11,r0
+	or	r5,r9,r10     /* Merge everything in one doubleword.  */
+	cmpldi	cr7,r5,0
+	bne	cr7,L(found)
+	bdnz	L(loop)
+
+	/* We may have one more dword to read.  */
+	cmpld	cr6,r8,r7
+	beq	cr6,L(end_max)
+
+	ldu	r12,8(r8)
+	cmpb	r10,r12,r0
+	cmpldi	cr6,r10,0
+	bne	cr6,L(done)
+
+L(end_max):
+	mr	r3,r4
+	blr
+
+	/* OK, one (or both) of the doublewords contains a null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a null byte.  */
+	.align	4
+L(found):
+	cmpldi	cr6,r10,0
+	addi	r8,r8,-8
+	bne	cr6,L(done)
+
+	/* The null byte must be in the second doubleword.  Adjust the address
+	   again and move the result of cmpb to r10 so we can calculate the
+	   length.  */
+
+	mr	r10,r9
+	addi	r8,r8,8
+
+	/* r10 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the null byte in the original
+	   doubleword from the string.  Use that to calculate the length.
+	   We need to make sure the null char is *before* the end of the
+	   range.  */
+L(done):
+#ifdef __LITTLE_ENDIAN__
+	addi	r0,r10,-1
+	andc	r0,r0,r10
+	popcntd	r0,r0
+#else
+	cntlzd	r0,r10	      /* Count leading zeros before the match.  */
+#endif
+	sub	r3,r8,r3
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r3,r0      /* Length until the match.  */
+	cmpld	r3,r4
+	blelr
+	mr	r3,r4
+	blr
+
+/* Deals with size <= 32.  */
+	.align	4
+L(small_range):
+	cmpldi	r4,0
+	beq	L(end_max)
+
+	clrrdi	r7,r7,3       /* Address of last doubleword.  */
+
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
+	cmpb	r10,r12,r0    /* Check for null bytes in DWORD1.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	sld	r10,r10,r6
+#else
+	sld	r10,r10,r6
+	srd	r10,r10,r6
+#endif
+	cmpldi	cr7,r10,0
+	bne	cr7,L(done)
+
+	cmpld	r8,r7
+	beq	L(end_max)
+
+	.p2align  5
+L(loop_small):
+	ldu	r12,8(r8)
+	cmpb	r10,r12,r0
+	cmpldi	cr6,r10,0
+	bne	cr6,L(done)
+	cmpld	r8,r7
+	bne	L(loop_small)
+	mr	r3,r4
+	blr
+
+END (STRNLEN)
+libc_hidden_def (__strnlen)
+weak_alias (__strnlen, strnlen)
+libc_hidden_def (strnlen)
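The comment in L(loop_setup) derives the loop trip count; the same arithmetic in C, as a cross-check of the cnt = (r7 - r8) / 16 formula (illustrative, with made-up names; p0 and last correspond to r8 and r7 above):

#include <stdint.h>
#include <stddef.h>

static size_t
loop_iterations (const char *s, size_t size)
{
  uintptr_t p0   = (uintptr_t) s & ~(uintptr_t) 7;              /* r8 */
  uintptr_t last = ((uintptr_t) s + size - 1) & ~(uintptr_t) 7; /* r7 */
  /* First paired load is at p0 + 8; each iteration reads 16 bytes,
     so the last dword read is at p0 + 8 + 16 * cnt - 8.  */
  return (last - p0) >> 4;
}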
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strrchr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strrchr.S
new file mode 100644
index 0000000000..c22393deb5
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strrchr.S
@@ -0,0 +1,260 @@
+/* Optimized strrchr implementation for PowerPC64/POWER7 using cmpb insn.
+   Copyright (C) 2014-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* char * [r3] strrchr (char *s [r3], int c [r4])  */
+
+#ifndef STRRCHR
+# define STRRCHR strrchr
+#endif
+
+	.machine  power7
+ENTRY (STRRCHR)
+	CALL_MCOUNT 2
+	dcbt	0,r3
+	clrrdi	r8,r3,3	      /* Align the address to doubleword boundary.  */
+	cmpdi	cr7,r4,0
+	ld	r12,0(r8)     /* Load doubleword from memory.  */
+	li	r9,0	      /* used to store the last occurrence  */
+	li	r0,0	      /* Doubleword with null chars to use
+				 with cmpb.  */
+
+	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
+
+	beq	cr7,L(null_match)
+
+	/* Replicate byte to doubleword.  */
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32
+	insrdi	r4,r4,32,0
+
+	/* r4 has been replicated from its low byte; if that byte was
+	   zero (c was passed with only high bits set), the replicated
+	   value is zero, so check for the null case again.  */
+	cmpdi	cr7,r4,0
+	beq	cr7,L(null_match)
+	/* Now r4 has a doubleword of c bytes and r0 has
+	   a doubleword of null bytes.  */
+
+	cmpb	r10,r12,r4     /* Compare each byte against c byte.  */
+	cmpb	r11,r12,r0     /* Compare each byte against null byte.  */
+
+	/* Move the doublewords left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+#else
+	sld	r10,r10,r6
+	sld	r11,r11,r6
+	srd	r10,r10,r6
+	srd	r11,r11,r6
+#endif
+	or	r5,r10,r11    /* OR the results to speed things up.  */
+	cmpdi	cr7,r5,0      /* If r5 == 0, no c or null bytes
+				 have been found.  */
+	bne	cr7,L(done)
+
+L(align):
+	mtcrf	0x01,r8
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	28,L(loop)
+
+	/* Handle DWORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb	r10,r12,r4
+	cmpb	r11,r12,r0
+	or	r5,r10,r11
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done)
+	b	L(loop)	      /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+	.p2align  5
+L(loop):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu	r7,16(r8)
+	cmpb	r10,r12,r4
+	cmpb	r11,r12,r0
+	cmpb	r6,r7,r4
+	cmpb	r7,r7,r0
+	or	r12,r10,r11
+	or	r5,r6,r7
+	or	r5,r12,r5
+	cmpdi	cr7,r5,0
+	beq	cr7,L(loop)
+
+	/* OK, one (or both) of the doublewords contains a c/null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a c/null byte.  */
+	cmpdi	cr6,r12,0
+	addi	r8,r8,-8
+	bne	cr6,L(done)
+
+	/* The c/null byte must be in the second doubleword.  Adjust the
+	   address again and move the result of cmpb to r10 so we can calculate
+	   the pointer.  */
+
+	mr	r10,r6
+	mr	r11,r7
+	addi	r8,r8,8
+
+	/* r10/r11 have the output of the cmpb instructions, that is,
+	   0xff in the same position as the c/null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+
+L(done):
+	/* If there is a null byte (0xff in r11), find the position of
+	   the first one and clear the c matches in r10 that fall
+	   beyond it.  */
+	cmpdi	cr7,r11,0
+	beq	cr7,L(no_null)
+#ifdef __LITTLE_ENDIAN__
+	addi	r3,r11,-1
+	andc	r3,r3,r11
+	popcntd r0,r3
+#else
+	cntlzd	r0,r11
+#endif
+	subfic	r0,r0,63
+	li	r6,-1
+#ifdef __LITTLE_ENDIAN__
+	srd	r0,r6,r0
+#else
+	sld	r0,r6,r0
+#endif
+	and	r10,r0,r10
+L(no_null):
+#ifdef __LITTLE_ENDIAN__
+	cntlzd	r0,r10		/* Count leading zeros before c matches.  */
+	addi	r3,r10,-1
+	andc	r3,r3,r10
+	addi	r10,r11,-1
+	andc	r10,r10,r11
+	cmpld	cr7,r3,r10
+	bgt	cr7,L(no_match)
+#else
+	addi	r3,r10,-1	/* Count trailing zeros before c matches.  */
+	andc	r3,r3,r10
+	popcntd	r0,r3
+	cmpld	cr7,r11,r10
+	bgt	cr7,L(no_match)
+#endif
+	srdi	r0,r0,3		/* Convert leading/trailing zeros to bytes.  */
+	subfic	r0,r0,7
+	add	r9,r8,r0      /* Return address of the matching c byte
+				 or null in case c was not found.  */
+	li	r0,0
+	cmpdi	cr7,r11,0     /* If r11 == 0, no nulls have been found.  */
+	beq	cr7,L(align)
+
+	.align	4
+L(no_match):
+	mr	r3,r9
+	blr
+
+/* We are here because strrchr was called with a null byte.  */
+	.align	4
+L(null_match):
+	/* r0 has a doubleword of null bytes.  */
+
+	cmpb	r5,r12,r0     /* Compare each byte against null bytes.  */
+
+	/* Move the doublewords left and right to discard the bits that are
+	   not part of the string and bring them back as zeros.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r5,r5,r6
+	sld	r5,r5,r6
+#else
+	sld	r5,r5,r6
+	srd	r5,r5,r6
+#endif
+	cmpdi	cr7,r5,0      /* If r5 == 0, no null bytes
+				 have been found.  */
+	bne	cr7,L(done_null)
+
+	mtcrf	0x01,r8
+
+	/* Are we now aligned to a quadword boundary?  If so, skip to
+	   the main loop.  Otherwise, go through the alignment code.  */
+
+	bt	28,L(loop_null)
+
+	/* Handle DWORD2 of pair.  */
+	ldu	r12,8(r8)
+	cmpb	r5,r12,r0
+	cmpdi	cr7,r5,0
+	bne	cr7,L(done_null)
+	b	L(loop_null)  /* We branch here (rather than falling through)
+				 to skip the nops due to heavy alignment
+				 of the loop below.  */
+
+	/* Main loop to look for the end of the string.  Since it's a
+	   small loop (< 8 instructions), align it to 32 bytes.  */
+	.p2align  5
+L(loop_null):
+	/* Load two doublewords, compare and merge in a
+	   single register for speed.  This is an attempt
+	   to speed up the null-checking process for bigger strings.  */
+	ld	r12,8(r8)
+	ldu	r11,16(r8)
+	cmpb	r5,r12,r0
+	cmpb	r10,r11,r0
+	or	r6,r5,r10
+	cmpdi	cr7,r6,0
+	beq	cr7,L(loop_null)
+
+	/* OK, one (or both) of the doublewords contains a null byte.  Check
+	   the first doubleword and decrement the address in case the first
+	   doubleword really contains a null byte.  */
+
+	cmpdi	cr6,r5,0
+	addi	r8,r8,-8
+	bne	cr6,L(done_null)
+
+	/* The null byte must be in the second doubleword.  Adjust the address
+	   again and move the result of cmpb to r10 so we can calculate the
+	   pointer.  */
+
+	mr	r5,r10
+	addi	r8,r8,8
+
+	/* r5 has the output of the cmpb instruction, that is, it contains
+	   0xff in the same position as the null byte in the original
+	   doubleword from the string.  Use that to calculate the pointer.  */
+L(done_null):
+#ifdef __LITTLE_ENDIAN__
+	addi	r0,r5,-1
+	andc	r0,r0,r5
+	popcntd	r0,r0
+#else
+	cntlzd	r0,r5	      /* Count leading zeros before the match.  */
+#endif
+	srdi	r0,r0,3	      /* Convert leading/trailing zeros to bytes.  */
+	add	r3,r8,r0      /* Return address of the matching null byte.  */
+	blr
+END (STRRCHR)
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
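Control-flow-wise the routine differs from strchr in one way: a hit cannot end the search, because a later occurrence may follow.  Register r9 carries the best match so far, and after each resolved doubleword the scan resumes at L(align) until the terminator is seen.  A minimal byte-wise C model of that flow, illustrative only; strrchr_model is a made-up name:

#include <stddef.h>

char *
strrchr_model (const char *s, int c)
{
  char *last = NULL;            /* r9: last occurrence seen so far.  */
  for (;; s++)
    {
      if (*s == (char) c)
        last = (char *) s;      /* Remember and keep scanning.  */
      if (*s == '\0')
        return last;            /* Covers c == '\0' as well.  */
    }
}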
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c
new file mode 100644
index 0000000000..a917b2157e
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr-ppc64.c
@@ -0,0 +1,27 @@
+/* Optimized strstr implementation for PowerPC64/POWER7.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+#define STRSTR __strstr_ppc
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(__name)
+
+extern __typeof (strstr) __strstr_ppc attribute_hidden;
+
+#include <string/strstr.c>
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr.S
new file mode 100644
index 0000000000..260db2ed6d
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/strstr.S
@@ -0,0 +1,521 @@
+/* Optimized strstr implementation for PowerPC64/POWER7.
+   Copyright (C) 2015-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* char * [r3] strstr (char *s [r3], char *pat [r4])  */
+
+/* The performance gain is obtained through aligned memory accesses,
+   doubleword loads, and use of the cmpb instruction for quicker
+   comparison.  */
+
+#define ITERATIONS	64
+
+#ifndef STRSTR
+# define STRSTR strstr
+#endif
+
+#ifndef STRLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+   GLIBC symbol (created by libc_hidden_builtin_def).  */
+# ifdef SHARED
+#  define STRLEN   __GI_strlen
+# else
+#  define STRLEN   strlen
+# endif
+#endif
+
+#ifndef STRNLEN
+/* For builds with no IFUNC support, local calls should be made to internal
+   GLIBC symbol (created by libc_hidden_builtin_def).  */
+# ifdef SHARED
+#  define STRNLEN   __GI_strnlen
+# else
+#  define STRNLEN  __strnlen
+# endif
+#endif
+
+#ifndef STRCHR
+# ifdef SHARED
+#  define STRCHR   __GI_strchr
+# else
+#  define STRCHR   strchr
+# endif
+#endif
+
+#define	FRAMESIZE	(FRAME_MIN_SIZE+32)
+	.machine  power7
+EALIGN (STRSTR, 4, 0)
+	CALL_MCOUNT 2
+	mflr	r0			/* Load link register LR to r0.  */
+	std	r31, -8(r1)		/* Save callers register r31.  */
+	std	r30, -16(r1)		/* Save callers register r30.  */
+	std	r29, -24(r1)		/* Save callers register r29.  */
+	std	r28, -32(r1)		/* Save callers register r28.  */
+	std	r0, 16(r1)		/* Store the link register.  */
+	cfi_offset(r31, -8)
+	cfi_offset(r30, -16)
+	cfi_offset(r28, -32)
+	cfi_offset(r29, -24)
+	cfi_offset(lr, 16)
+	stdu	r1, -FRAMESIZE(r1)	/* Create the stack frame.  */
+	cfi_adjust_cfa_offset(FRAMESIZE)
+
+	dcbt	0, r3
+	dcbt	0, r4
+	cmpdi	cr7, r3, 0
+	beq	cr7, L(retnull)
+	cmpdi	cr7, r4, 0
+	beq	cr7, L(retnull)
+
+	mr	r29, r3
+	mr	r30, r4
+	mr	r3, r4
+	bl	STRLEN
+	nop
+
+	cmpdi	cr7, r3, 0	/* If the search string is empty.  */
+	beq	cr7, L(ret_r3)
+
+	mr	r31, r3
+	mr	r4, r3
+	mr	r3, r29
+	bl	STRNLEN
+	nop
+
+	cmpd	cr7, r3, r31	/* If len(s) < len(search str).  */
+	blt	cr7, L(retnull)
+	mr	r3, r29
+	lbz	r4, 0(r30)
+	bl	STRCHR
+	nop
+
+	mr	r11, r3
+	/* If first char of search str is not present.  */
+	cmpdi	cr7, r3, 0
+	ble	cr7, L(end)
+	/* Reg r28 is used to count the number of iterations. */
+	li	r28, 0
+	rldicl	r8, r3, 0, 52	/* Page cross check.  */
+	cmpldi	cr7, r8, 4096-16
+	bgt	cr7, L(bytebybyte)
+
+	rldicl	r8, r30, 0, 52
+	cmpldi	cr7, r8, 4096-16
+	bgt	cr7, L(bytebybyte)
+
+	/* If len(r4) < 8 handle in a different way.  */
+	/* Shift position based on null and use cmpb.  */
+	cmpdi	cr7, r31, 8
+	blt	cr7, L(lessthan8)
+
+	/* Len(r4) >= 8 reaches here.  */
+	mr	r8, r3		/* Save r3 for future use.  */
+	mr	r4, r30		/* Restore r4.  */
+	li	r0, 0
+	rlwinm	r10, r30, 3, 26, 28	/* Calculate padding in bits.  */
+	clrrdi	r4, r4, 3	/* Make r4 aligned to 8.  */
+	ld	r6, 0(r4)
+	addi	r4, r4, 8
+	cmpdi	cr7, r10, 0	/* Check if it is already aligned.  */
+	beq	cr7, L(begin1)
+#ifdef __LITTLE_ENDIAN__
+	srd	r6, r6, r10	/* Discard unwanted bits.  */
+#else
+	sld	r6, r6, r10
+#endif
+	ld	r9, 0(r4)
+	subfic	r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+	sld	r9, r9, r10	/* Discard unwanted bits.  */
+#else
+	srd	r9, r9, r10
+#endif
+	or	r6, r6, r9	/* Form complete search str.  */
+L(begin1):
+	mr	r29, r6
+	rlwinm	r10, r3, 3, 26, 28
+	clrrdi	r3, r3, 3
+	ld	r5, 0(r3)
+	cmpb	r9, r0, r6	/* Check if search str has null.  */
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(return3)
+	cmpb	r9, r0, r5	/* Check if input has null.  */
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, r9, r10
+#else
+	sld	r9, r9, r10
+#endif
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(retnull)
+
+	li	r12, -8		/* Shift values.  */
+	li	r11, 72		/* Shift values.  */
+	cmpdi	cr7, r10, 0
+	beq	cr7, L(nextbyte1)
+	mr	r12, r10
+	addi	r12, r12, -8
+	subfic	r11, r12, 64
+
+L(nextbyte1):
+	ldu	r7, 8(r3) 	/* Load next dw.  */
+	addi	r12, r12, 8	/* Shift one byte and compare.  */
+	addi	r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, r5, r12	/* Rotate based on mask.  */
+	sld	r10, r7, r11
+#else
+	sld	r9, r5, r12
+	srd	r10, r7, r11
+#endif
+	/* Form a single dw from bytes of the first and second loads.  */
+	or	r10, r9, r10
+	/* Check for null in the formed dw.  */
+	cmpb	r9, r0, r10
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(retnull)
+	/* Cmpb search str and input str.  */
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	beq	cr7, L(match)
+	addi	r8, r8, 1
+	b	L(begin)
+
+	.align	4
+L(match):
+	/* There is a match of 8 bytes, check next bytes.  */
+	cmpdi	cr7, r31, 8
+	beq	cr7, L(return)
+	/* Update next starting point r8.  */
+	srdi	r9, r11, 3
+	subf	r9, r9, r3
+	mr	r8, r9
+
+L(secondmatch):
+	mr	r5, r7
+	rlwinm	r10, r30, 3, 26, 28	/* Calculate padding in bits.  */
+	ld	r6, 0(r4)
+	addi	r4, r4, 8
+	cmpdi	cr7, r10, 0	/* Check if it is already aligned.  */
+	beq	cr7, L(proceed3)
+#ifdef __LITTLE_ENDIAN__
+	srd	r6, r6, r10	/* Discard unwanted bits.  */
+	cmpb	r9, r0, r6
+	sld	r9, r9, r10
+#else
+	sld	r6, r6, r10
+	cmpb	r9, r0, r6
+	srd	r9, r9, r10
+#endif
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(proceed3)
+	ld	r9, 0(r4)
+	subfic	r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+	sld	r9, r9, r10	/* Discard unwanted bits.  */
+#else
+	srd	r9, r9, r10
+#endif
+	or	r6, r6, r9	/* Form complete search str.  */
+
+L(proceed3):
+	li	r7, 0
+	addi	r3, r3, 8
+	cmpb	r9, r0, r5
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(proceed4)
+	ld	r7, 0(r3)
+L(proceed4):
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, r5, r12
+	sld	r10, r7, r11
+#else
+	sld	r9, r5, r12
+	srd	r10, r7, r11
+#endif
+	/* Form a single dw from the bytes of the first and second loads.  */
+	or	r10, r9, r10
+	cmpb	r9, r0, r6
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(return4)
+	/* Check for null in the formed dw.  */
+	cmpb	r9, r0, r10
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(retnull)
+	/* If the next 8 bytes don't match, start the search again.  */
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	bne	cr7, L(reset)
+	/* If the next 8 bytes match, load and compare next 8.  */
+	b	L(secondmatch)
+
+	.align	4
+L(reset):
+	/* Start the search again.  */
+	addi	r8, r8, 1
+	b	L(begin)
+
+	.align	4
+L(return3):
+	/* Find the first null byte (count trailing zeros on LE, leading
+	   zeros on BE) and compare the partial dw.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	r7, r9, -1
+	andc	r7, r7, r9
+	popcntd	r7, r7
+	subfic	r7, r7, 64
+	sld	r10, r5, r7
+	sld	r6, r6, r7
+#else
+	cntlzd	r7, r9
+	subfic	r7, r7, 64
+	srd	r10, r5, r7
+	srd	r6, r6, r7
+#endif
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	addi	r8, r8, 1
+	/* Start search again if there is no match.  */
+	bne	cr7, L(begin)
+	/* If the words match, update return values.  */
+	subfic	r7, r7, 64
+	srdi	r7, r7, 3
+	add	r3, r3, r7
+	subf	r3, r31, r3
+	b	L(end)
+
+	.align	4
+L(return4):
+	/* Find the first null byte (count trailing zeros on LE, leading
+	   zeros on BE) and compare the partial dw.  */
+#ifdef __LITTLE_ENDIAN__
+	addi	r7, r9, -1
+	andc	r7, r7, r9
+	popcntd	r7, r7
+	subfic	r7, r7, 64
+	sld	r10, r10, r7
+	sld	r6, r6, r7
+#else
+	cntlzd	r7, r9
+	subfic	r7, r7, 64
+	srd	r10, r10, r7
+	srd	r6, r6, r7
+#endif
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	addi	r8, r8, 1
+	bne	cr7, L(begin)
+	subfic	r7, r7, 64
+	srdi	r11, r11, 3
+	subf	r3, r11, r3
+	srdi	r7, r7, 3
+	add	r3, r3, r7
+	subf	r3, r31, r3
+	b	L(end)
+
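+	/* Restart the outer search at the updated position in r8, using
+	   strchr to find the next occurrence of the first byte of the
+	   search str; r28 caps the number of restarts at ITERATIONS.  */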
+	.align	4
+L(begin):
+	mr	r3, r8
+	/* When iterations exceed ITERATIONS, fall back to the default.  */
+	addi	r28, r28, 1
+	cmpdi	cr7, r28, ITERATIONS
+	beq	cr7, L(default)
+	lbz	r4, 0(r30)
+	bl	STRCHR
+	nop
+	/* If first char of search str is not present.  */
+	cmpdi	cr7, r3, 0
+	ble	cr7, L(end)
+	mr	r8, r3
+	mr	r4, r30		/* Restore r4.  */
+	li	r0, 0
+	mr	r6, r29
+	clrrdi	r4, r4, 3
+	addi	r4, r4, 8
+	b	L(begin1)
+
+	/* Handle search strings shorter than 8 bytes.  */
+	.align	4
+L(lessthan8):
+	mr	r4, r3
+	mr	r9, r30
+	li	r0, 0
+
+	rlwinm	r10, r9, 3, 26, 28	/* Calculate padding in bits.  */
+	srdi	r8, r10, 3	/* Padding in bytes.  */
+	clrrdi	r9, r9, 3	/* Make r9 (search str) aligned to 8.  */
+	ld	r6, 0(r9)
+	cmpdi	cr7, r10, 0	/* Check whether it is already aligned.  */
+	beq	cr7, L(proceed2)
+#ifdef __LITTLE_ENDIAN__
+	srd	r6, r6, r10	/* Discard unwanted bits.  */
+#else
+	sld	r6, r6, r10
+#endif
+	subfic	r8, r8, 8
+	cmpd	cr7, r8, r31	/* Next load needed?  */
+	bge	cr7, L(proceed2)
+	ld	r7, 8(r9)
+	subfic	r10, r10, 64
+#ifdef __LITTLE_ENDIAN__
+	sld	r7, r7, r10	/* Discard unwanted bits.  */
+#else
+	srd	r7, r7, r10
+#endif
+	or	r6, r6, r7	/* Form complete search str.  */
+L(proceed2):
+	mr	r29, r6
+	rlwinm	r10, r3, 3, 26, 28
+	clrrdi	r7, r3, 3	/* Make r3 aligned.  */
+	ld	r5, 0(r7)
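+	/* The whole search str fits in one dw; r8 = 64 - 8 * len is
+	   used to shift out the bytes beyond its length before cmpb.  */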
+	sldi	r8, r31, 3
+	subfic	r8, r8, 64
+#ifdef __LITTLE_ENDIAN__
+	sld	r6, r6, r8
+	cmpb	r9, r0, r5
+	srd	r9, r9, r10
+#else
+	srd	r6, r6, r8
+	cmpb	r9, r0, r5
+	sld	r9, r9, r10
+#endif
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(noload)
+	cmpdi	cr7, r10, 0
+	beq	cr7, L(continue)
+	ld	r7, 8(r7)
+L(continue1):
+	mr	r12, r10
+	addi	r12, r12, -8
+	subfic	r11, r12, 64
+	b	L(nextbyte)
+
+	.align	4
+L(continue):
+	ld	r7, 8(r7)
+	li	r12, -8		/* Shift values.  */
+	li	r11, 72		/* Shift values.  */
+L(nextbyte):
+	addi	r12, r12, 8	/* Advance the window by one byte.  */
+	addi	r11, r11, -8
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, r5, r12
+	sld	r10, r7, r11
+	or	r10, r9, r10
+	sld	r10, r10, r8
+	cmpb	r9, r0, r10
+	srd	r9, r9, r8
+#else
+	sld	r9, r5, r12
+	srd	r10, r7, r11
+	or	r10, r9, r10
+	srd	r10, r10, r8
+	cmpb	r9, r0, r10
+	sld	r9, r9, r8
+#endif
+	cmpdi	cr7, r9, 0
+	bne	cr7, L(retnull)
+	cmpb	r9, r10, r6
+	cmpdi	cr7, r9, -1
+	beq	cr7, L(end)
+	addi	r3, r4, 1
+	/* When iterations exceed ITERATIONS, fall back to the default.  */
+	addi	r28, r28, 1
+	cmpdi	cr7, r28, ITERATIONS
+	beq	cr7, L(default)
+	lbz	r4, 0(r30)
+	bl	STRCHR
+	nop
+	/* If first char of search str is not present.  */
+	cmpdi	cr7, r3, 0
+	ble	cr7, L(end)
+	mr	r4, r3
+	mr	r6, r29
+	li	r0, 0
+	b	L(proceed2)
+
+	.align	4
+L(noload):
+	/* Reached null in r3, so skip next load.  */
+	li 	r7, 0
+	b	L(continue1)
+
+	.align	4
+L(return):
+	/* Update return values.  */
+	srdi	r9, r11, 3
+	subf	r3, r9, r3
+	b	L(end)
+
+	/* Byte-by-byte search, used when dw loads might cross a page
+	   boundary.  */
+	.align	4
+L(bytebybyte):
+	mr	r8, r3
+	addi	r8, r8, -1
+L(loop1):
+	addi	r8, r8, 1
+	mr	r3, r8
+	mr	r4, r30
+	lbz	r6, 0(r4)
+	cmpdi	cr7, r6, 0
+	beq	cr7, L(updater3)
+L(loop):
+	lbz	r5, 0(r3)
+	cmpdi	cr7, r5, 0
+	beq	cr7, L(retnull)
+	cmpld	cr7, r6, r5
+	bne	cr7, L(loop1)
+	addi	r3, r3, 1
+	addi	r4, r4, 1
+	lbz	r6, 0(r4)
+	cmpdi	cr7, r6, 0
+	beq	cr7, L(updater3)
+	b	L(loop)
+
+	/* Handling return values.  */
+	.align	4
+L(updater3):
+	subf	r3, r31, r3	/* Subtract len of search str (r31) from r3.  */
+	b	L(end)
+
+	.align	4
+L(ret_r3):
+	mr	r3, r29		/* Search str is empty; return the input str.  */
+	b	L(end)
+
+	.align	4
+L(retnull):
+	li	r3, 0		/* Return NULL.  */
+	b	L(end)
+
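+	/* Too many restarts; hand the remaining work to the generic
+	   implementation.  */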
+	.align	4
+L(default):
+	mr	r4, r30
+	bl	__strstr_ppc
+	nop
+
+	.align	4
+L(end):
+	addi	r1, r1, FRAMESIZE	/* Restore stack pointer.  */
+	cfi_adjust_cfa_offset(-FRAMESIZE)
+	ld	r0, 16(r1)	/* Restore the saved link register.  */
+	ld	r28, -32(r1)	/* Restore caller's saved register r28.  */
+	ld	r29, -24(r1)	/* Restore caller's saved register r29.  */
+	ld	r30, -16(r1)	/* Restore caller's saved register r30.  */
+	ld	r31, -8(r1)	/* Restore caller's saved register r31.  */
+	mtlr	r0		/* Restore the link register.  */
+	blr
+END (STRSTR)
+libc_hidden_builtin_def (strstr)
diff --git a/REORG.TODO/sysdeps/powerpc/powerpc64/power7/sub_n.S b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/sub_n.S
new file mode 100644
index 0000000000..848dad5718
--- /dev/null
+++ b/REORG.TODO/sysdeps/powerpc/powerpc64/power7/sub_n.S
@@ -0,0 +1,23 @@
+/* PowerPC64 mpn_sub_n -- subtract two limb vectors of the same length and
+   store the difference.
+   Copyright (C) 2013-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
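+/* With USE_AS_SUB defined, the shared body in add_n.S assembles as
+   mpn_sub_n instead of mpn_add_n.  */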
+#define USE_AS_SUB
+#include "add_n.S"